Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-21 04:12:55 +00:00)

Compare commits (339 commits): compute_no ... two_phase_
Commits (SHA1):
e9c7665c81, eb1f1a347d, 064aa44a06, d6ee61b5cf, 4b78a16b82, c093ee5e4b, 7685372cae, ce54133ec4, 610e14a7fc, 35a1c3d521,
22b7e74c83, d95e1da742, 40d047c146, 42f3dd47d2, c2b2ab974c, 6ad6e5bd84, d534aeb9e1, d45839879c, 1f6ca23db6, 2127a65e27,
ecf2d181c4, c1bfa32771, 8465738aa5, 87d7ce816d, f38c2e620e, 86056abd0e, 2bf2dd1d88, 874d82fd4c, 3645133700, 20b6279beb,
06f96f9600, b5f60f3874, 0ec56cd21f, 600e1a0080, 9c94a34ae7, 9c0ac251df, 872ed24408, 2f25d17e11, 8faa6fa392, 4d5a41301d,
4c35b22626, 9fe3b73e13, e0146304e6, fbb04c592a, 8f43d7637c, cf30303d8f, 1ec157653e, 858ca3a4ce, d744ddee7c, 3296b7d770,
2148ae78ab, 78dcf2207e, 74b78608d9, a11558b84f, 513696a485, cedc2eb5c2, e3e593f571, c12e393e74, d59cb2ca7a, 58f34a8d76,
31462f4b71, 538f903861, e6a7241c3a, 709b778904, aa8debf4e8, 1912546e52, a6178c135f, 2ff16da6af, 21ea70c8f5, 2b2d24433a,
66bced0f36, 9ba7bc2695, 8624bddc79, 45b1495f37, 23be5021f8, f954d5c501, ab2f0ad1a8, 52fbcbde0a, e602807476, 398d522d88,
746f667311, 53ea6702bd, 952424b78c, d737c40eec, 532918e13d, b266c28345, 04dc698d4b, 6b11b4250e, 15d1c1f8bf, 9ece1e863d,
2870150365, 7b281900f9, 97992226d3, 270356ec38, c2db828481, 71e93faed7, 54d52e07db, 4dccdb33ab, 38c4b6f02f, 6ff3f1b9fd,
4c5e23d014, 99d80aba52, 2f2dff4c8d, 22e7fcbf2d, 372617a4f5, 49d1921a28, d8e509d29e, d5bfe84d9e, 8fff26ad49, 5f4e32f505,
fb71c85a79, ff76226a35, 6e748147b6, e5df42feef, 73647e5715, 95db33f3f9, bace19ffbe, 60d66267a9, 294320e6a8, 28b4d9abb3,
8d8bc304c1, 4788248e11, 0cbb3798da, 36c12247b9, 1767208563, d25656797c, 6c825dcbaa, 4b46693c81, 8952066ecb, d26b76fe7c,
df5a55c445, e5e5c3e067, b7575582b8, 77fd24b950, 61af9bb889, a68f60415b, e7ca580922, 33d126ecbe, 15db0d1d6f, 29f122009a,
bf0a0cb55d, 0fe5abadf5, 1591f058c6, efa4ecaa7c, 8e57c2e413, 4dd63821bd, eeec1a3dcb, b484b896b6, e5413be5fa, b9c0d22045,
2e0d45d092, 86932c20eb, f5b45a172c, e6a0987182, aa64391265, aac913f9dc, 4e2e5bb4e6, 3e15a5c325, ce646ea845, effcabb590,
a08dfb1c2c, a3818dee58, 219cbe2d9c, 129f85f652, 790f1b05c6, 37cd662ab2, 277a4d4582, 1cdeba9db7, 7d104e5660, 49530145d8,
da96965897, 3762b53986, 9ad99152b8, 651a8139f5, f82c3eb5e2, eea6f0898e, 086c0ad829, 6c7ea82a61, b77597bd99, 68aa2febc9,
1369145e83, b49164a1d4, e7b112aacc, f491a22d85, 26115818b7, 158d1bbbb4, 6a43b293ad, 69df9f10ed, 61aee52a90, 975b2d12dc,
ab61ce2267, 14168c7aa7, 7a8501d12f, 34d55b09a3, 41a3772e90, bbec5a13bd, 421d586953, ef37eb96b9, d311f708b6, c7f54af1f1,
44a85d9176, 96beffb3c5, cff671c1bd, 4acdcbe90f, fdf6829de5, b361558a8a, c59830fd01, 636194406f, 3b09a74f58, f617115467,
4f529b7d4a, bc652e965e, 3b9e7fc5e6, 5292b502f3, abcecc992e, 96b6f350a7, 648755a25e, 1c775bdcac, 07d0241076, d760446053,
01e239afa3, f62ce4bcf7, 3d3eb0ed16, da9bf5dc63, 1cb9b5523b, 968cd8f20c, 3e007b0eb9, 5e0cc89de8, 0fc05569e0, 021462da3e,
93d7d2ae2a, fe79082e29, 6dfe196c40, 8beaf76c85, 499b4f7eba, 52ee3a2bac, b64bd2a8af, 573f1ada83, 904ccbdb70, 59b23fef64,
0eaff5aa7f, db5712f28b, 5f277755b1, ee87e6aad3, ff3488fadd, 4a0a9e748c, 6aa38d3f7d, 28f2800275, 8af5cbedb1, 75baf670f5,
2ca8fbb6ff, 546266b86d, c5a8c31b8a, bab954b87f, 3ded550272, ed30f2096c, da9508716d, 2dbbb8c59b, f3192ee415, 9e7c45cb72,
18ba16aaac, a4fd1e1a80, 9b71ae7dce, 2cd730d31f, 8060e17b50, 1f3f4cfaf5, a22cb7acc1, 785502c92c, 69b786040e, 4f3f0304c2,
c981f4ad66, c794f128cc, 220a023e51, e911427872, eb42fbadeb, d8fa2ec367, 07507274c0, 92e4f4b3b6, b5a5ea5831, f387769203,
7f777a485e, d8ab2e00cb, f520ef9a64, d047a3abf7, f69db17409, 3600b33f1c, 2c5fb6d6c8, 8604bb8750, 936cad17e4, fa5d31056b,
583f64768f, c5d56ffe22, b451ede199, 533087fd5d, 95160dee6d, 8aa3013ec2, 8879f747ee, 9809613c6f, 8d1bf152cf, 3725815935,
b32cc6a088, 3c7f810849, 1e65848551, e03417a7c9, 33ee5b6ba0, 52d6275812, 639c9e8266, 35e0099ac6, 4ff248515b, 8b70ea4d79,
2246b48348, e8032f26e6, d2c3ad162a, b4c5cb2773, b67df00bff, 24c3e961e4, 92fb7a1641, 05886b33e5, d7eeaec706, 1190030872,
913a91c541, 24b925d528, 82dc1e82ba, 2e9c730dd1, 6266fd102c, d9bc2109bb, d1d6c968d5, 3c4ebc4030, 46543f54a6, b07fa4c896,
f35d13183e, c5f379bff3, 39ebec51d1, 6264dc6aa3, 59163cf3b3, a606336074, 1816c4ca0a, 542dffa4a6, 07fb30747a
267 .circleci/config.yml (new file)
@@ -0,0 +1,267 @@
version: 2.1

orbs:
  python: circleci/python@1.4.0

executors:
  zenith-build-executor:
    resource_class: xlarge
    docker:
      - image: cimg/rust:1.51.0

jobs:

  # A job to build postgres
  build-postgres:
    executor: zenith-build-executor
    steps:
      # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
      - checkout

      # Grab the postgres git revision to build a cache key.
      # Note this works even though the submodule hasn't been checkout out yet.
      - run:
          name: Get postgres cache key
          command: |
            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres

      - restore_cache:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

      # FIXME We could cache our own docker container, instead of installing packages every time.
      - run:
          name: apt install dependencies
          command: |
            if [ ! -e tmp_install/bin/postgres ]; then
              sudo apt update
              sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libcurl4-openssl-dev
            fi

      # Build postgres if the restore_cache didn't find a build.
      # `make` can't figure out whether the cache is valid, since
      # it only compares file timestamps.
      - run:
          name: build postgres
          command: |
            if [ ! -e tmp_install/bin/postgres ]; then
              # "depth 1" saves some time by not cloning the whole repo
              git submodule update --init --depth 1
              make postgres
            fi

      - save_cache:
          name: Save postgres cache
          key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
          paths:
            - tmp_install

  # A job to build zenith rust code
  build-zenith:
    executor: zenith-build-executor
    parameters:
      build_type:
        type: enum
        enum: ["debug", "release"]
    steps:
      - run:
          name: apt install dependencies
          command: |
            sudo apt update
            sudo apt install libssl-dev clang

      # Checkout the git repo (without submodules)
      - checkout

      # Grab the postgres git revision to build a cache key.
      # Note this works even though the submodule hasn't been checkout out yet.
      - run:
          name: Get postgres cache key
          command: |
            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres

      - restore_cache:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

      - restore_cache:
          name: Restore rust cache
          keys:
            # Require an exact match. While an out of date cache might speed up the build,
            # there's no way to clean out old packages, so the cache grows every time something
            # changes.
            - v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}

      # Build the rust code, including test binaries
      - run:
          name: Rust build << parameters.build_type >>
          command: |
            export CARGO_INCREMENTAL=0
            BUILD_TYPE="<< parameters.build_type >>"
            if [[ $BUILD_TYPE == "debug" ]]; then
              echo "Build in debug mode"
              cargo build --bins --tests
            elif [[ $BUILD_TYPE == "release" ]]; then
              echo "Build in release mode"
              cargo build --release --bins --tests
            fi

      - save_cache:
          name: Save rust cache
          key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
          paths:
            - ~/.cargo/registry
            - ~/.cargo/git
            - target

      # Run rust unit tests
      # FIXME: remove -p zenith_utils once integration tests are moved to python
      - run: cargo test -p zenith_utils

      # Install the rust binaries, for use by test jobs
      # `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
      # FIXME: this is a really silly way to install; maybe we should just output
      # a tarball as an artifact? Or a .deb package?
      - run:
          name: cargo install
          command: |
            export CARGO_INCREMENTAL=0
            BUILD_TYPE="<< parameters.build_type >>"
            if [[ $BUILD_TYPE == "debug" ]]; then
              echo "Install debug mode"
              CARGO_FLAGS="--debug"
            elif [[ $BUILD_TYPE == "release" ]]; then
              echo "Install release mode"
              # The default is release mode; there is no --release flag.
              CARGO_FLAGS=""
            fi
            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith

      # Install the postgres binaries, for use by test jobs
      # FIXME: this is a silly way to do "install"; maybe just output a standard
      # postgres package, whatever the favored form is (tarball? .deb package?)
      # Note that pg_regress needs some build artifacts that probably aren't
      # in the usual package...?
      - run:
          name: postgres install
          command: |
            cp -a tmp_install /tmp/zenith/pg_install

      # Save the rust output binaries for other jobs in this workflow.
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
            - "*"

  run-pytest:
    #description: "Run pytest"
    executor: python/default
    parameters:
      # pytest args to specify the tests to run.
      #
      # This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory,
      # or '-k foobar' to run tests containing string 'foobar'. See pytest man page
      # section SPECIFYING TESTS / SELECTING TESTS for details.
      #
      # Select the type of Rust build. Must be "release" or "debug".
      build_type:
        type: string
        default: "debug"
      # This parameter is required, to prevent the mistake of running all tests in one job.
      test_selection:
        type: string
        default: ""
      # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
      extra_params:
        type: string
        default: ""
      needs_postgres_source:
        type: boolean
        default: false
    steps:
      - attach_workspace:
          at: /tmp/zenith
      - checkout
      - when:
          condition: << parameters.needs_postgres_source >>
          steps:
            - run: git submodule update --init --depth 1
      - run: pip install pytest psycopg2
      - run:
          name: Run pytest
          working_directory: test_runner
          environment:
            - ZENITH_BIN: /tmp/zenith/bin
            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
            - TEST_OUTPUT: /tmp/test_output
          command: |
            TEST_SELECTION="<< parameters.test_selection >>"
            EXTRA_PARAMS="<< parameters.extra_params >>"
            if [ -z "$TEST_SELECTION" ]; then
              echo "test_selection must be set"
              exit 1
            fi
            # Run the tests.
            #
            # The junit.xml file allows CircleCI to display more fine-grained test information
            # in its "Tests" tab in the results page.
            pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short $TEST_SELECTION $EXTRA_PARAMS
      - run:
          # CircleCI artifacts are preserved one file at a time, so skipping
          # this step isn't a good idea. If you want to extract the
          # pageserver state, perhaps a tarball would be a better idea.
          name: Delete pageserver data
          when: always
          command: |
            du -sh /tmp/test_output/*
            for DIR in /tmp/test_output/*; do
              mv $DIR/repo/pageserver.log $DIR/ || true # ignore errors
              for PGDIR in $DIR/repo/pgdatadirs/pg?; do
                echo "PGDIR: $PGDIR"
                NEW_LOG="${PGDIR##*/}_log"
                mv $PGDIR/log "$DIR/$NEW_LOG" || true # ignore errors
              done
              echo "rm $DIR/repo"
              rm -rf $DIR/repo
            done
            du -sh /tmp/test_output/*
      - store_artifacts:
          path: /tmp/test_output
      # The store_test_results step tells CircleCI where to find the junit.xml file.
      - store_test_results:
          path: /tmp/test_output

workflows:
  build_and_test:
    jobs:
      - build-postgres
      - build-zenith:
          name: build-zenith-<< matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          requires:
            - build-postgres
      - run-pytest:
          name: pg_regress tests << matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          test_selection: batch_pg_regress
          needs_postgres_source: true
          requires:
            - build-zenith-<< matrix.build_type >>
      - run-pytest:
          name: other tests << matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          test_selection: batch_others
          requires:
            - build-zenith-<< matrix.build_type >>
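For local debugging it can be handy to mimic the run-pytest job outside CI. A minimal sketch, assuming the binaries were installed under /tmp/zenith exactly as the build-zenith job above does; the paths, environment variables and the batch_others selection are all taken from the job definition:

```sh
# Prerequisite: the `cargo install --root /tmp/zenith ...` and
# `cp -a tmp_install /tmp/zenith/pg_install` steps from build-zenith have been run.
pip install pytest psycopg2
export ZENITH_BIN=/tmp/zenith/bin
export POSTGRES_DISTRIB_DIR=/tmp/zenith/pg_install
export TEST_OUTPUT=/tmp/test_output
cd test_runner
pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short batch_others
```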
45 .github/workflows/notifications.yml (vendored, new file)
@@ -0,0 +1,45 @@
name: Send Notifications

on:
  push:
    branches: [ main ]

jobs:
  send-notifications:
    timeout-minutes: 30
    name: send commit notifications
    runs-on: ubuntu-latest

    steps:

      - name: Checkout
        uses: actions/checkout@v2
        with:
          submodules: true
          fetch-depth: 2

      - name: Form variables for notification message
        id: git_info_grab
        run: |
          git_stat=$(git show --stat=50)
          git_stat="${git_stat//'%'/'%25'}"
          git_stat="${git_stat//$'\n'/'%0A'}"
          git_stat="${git_stat//$'\r'/'%0D'}"
          git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
          echo "::set-output name=git_stat::$git_stat"
          echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
          echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"

      - name: Send notification
        uses: appleboy/telegram-action@master
        with:
          to: ${{ secrets.TELEGRAM_TO }}
          token: ${{ secrets.TELEGRAM_TOKEN }}
          format: markdown
          args: |
            *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})

            ```
            ${{ steps.git_info_grab.outputs.git_stat }}
            ```
63 .github/workflows/testing.yml (vendored)
@@ -1,44 +1,36 @@
name: regression check
name: Build and Test

on: [push]
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  regression-check:
    strategy:
      matrix:
        # If we want to duplicate this job for different
        # Rust toolchains (e.g. nightly or 1.37.0), add them here.
        rust_toolchain: [stable]
        os: [ubuntu-latest]
    timeout-minutes: 30
    name: run regression test suite
    runs-on: ubuntu-latest
    runs-on: ${{ matrix.os }}

    steps:

      - name: Checkout
        uses: actions/checkout@v2
        with:
          submodules: true
          fetch-depth: 2

      - name: Form variables for notification message
        id: git_info_grab
        run: |
          git_stat=$(git show --stat=50)
          git_stat="${git_stat//'%'/'%25'}"
          git_stat="${git_stat//$'\n'/'%0A'}"
          git_stat="${git_stat//$'\r'/'%0D'}"
          git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
          echo "::set-output name=git_stat::$git_stat"
          echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
          echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"

      - name: Send notification
        uses: appleboy/telegram-action@master
      - name: install rust toolchain ${{ matrix.rust_toolchain }}
        uses: actions-rs/toolchain@v1
        with:
          to: ${{ secrets.TELEGRAM_TO }}
          token: ${{ secrets.TELEGRAM_TOKEN }}
          format: markdown
          args: |
            *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})

            ```
            ${{ steps.git_info_grab.outputs.git_stat }}
            ```
          profile: minimal
          toolchain: ${{ matrix.rust_toolchain }}
          override: true

      - name: Install postgres dependencies
        run: |
@@ -60,11 +52,7 @@ jobs:
      - name: Build postgres
        if: steps.cache_pg.outputs.cache-hit != 'true'
        run: |
          ./pgbuild.sh

      - name: Install rust
        run: |
          sudo apt install -y cargo
          make postgres

      - name: Cache cargo deps
        id: cache_cargo
@@ -76,13 +64,10 @@ jobs:
            target
          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}

      # That build is only to build dependencies and can be skipped if Cargo.lock
      # wasn't changed. Next steps need their own build
      - name: Install cargo deps
        if: steps.cache_cargo.outputs.cache-hit != 'true'
      - name: Run cargo build
        run: |
          cargo build
          cargo build --workspace --bins --examples --tests

      - name: Run test
      - name: Run cargo test
        run: |
          cargo test --test test_pageserver -- --nocapture --test-threads=1
          cargo test -- --nocapture --test-threads=1
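The new workflow replaces the old single test_pageserver run with a full workspace build and test. As a sketch, the same two commands (copied from the steps above) can serve as a local pre-push check:

```sh
cargo build --workspace --bins --examples --tests
cargo test -- --nocapture --test-threads=1
```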
6 .gitignore (vendored)
@@ -1,3 +1,9 @@
/target
/tmp_check
/tmp_install
/tmp_check_cli
__pycache__/
test_output/
.vscode
/.zenith
/integration_tests/.zenith
1073 Cargo.lock (generated): file diff suppressed because it is too large.

Cargo.toml
@@ -3,4 +3,9 @@ members = [
    "integration_tests",
    "pageserver",
    "walkeeper",
    "zenith",
    "control_plane",
    "postgres_ffi",
    "zenith_utils",
    "workspace_hack",
]
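With the new workspace members in place, individual crates can be built and tested on their own; a small sketch (the zenith_utils test invocation is the same one the CircleCI config runs):

```sh
cargo build -p control_plane
cargo test -p zenith_utils
```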
57 Makefile (new file)
@@ -0,0 +1,57 @@
#
# Top level Makefile to build Zenith and PostgreSQL
#
all: zenith postgres

# We don't want to run 'cargo build' in parallel with the postgres build,
# because interleaving cargo build output with postgres build output looks
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
# much parallelism. (Recursive invocation of postgres target still gets any
# '-j' flag from the command line, so 'make -j' is still useful.)
.NOTPARALLEL:

### Zenith Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
zenith: postgres-headers
	cargo build

### PostgreSQL parts
tmp_install/build/config.status:
	+@echo "Configuring postgres build"
	mkdir -p tmp_install/build
	(cd tmp_install/build && \
	../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
	--enable-depend --with-libxml --prefix=$(abspath tmp_install) > configure.log)

# nicer alias for running 'configure'
postgres-configure: tmp_install/build/config.status

# Install the PostgreSQL header files into tmp_install/include
postgres-headers: postgres-configure
	+@echo "Installing PostgreSQL headers"
	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install


# Compile and install PostgreSQL and contrib/zenith
postgres: postgres-configure
	+@echo "Compiling PostgreSQL"
	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
	+@echo "Compiling contrib/zenith"
	(cd vendor/postgres/contrib/zenith && \
	$(MAKE) PG_CONFIG=$(abspath tmp_install)/bin/pg_config install USE_PGXS=1)

postgres-clean:
	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean

# This doesn't remove the effects of 'configure'.
clean:
	cd tmp_install/build && ${MAKE} clean
	cargo clean

# This removes everything
distclean:
	rm -rf tmp_install
	cargo clean

.PHONY: postgres-configure postgres postgres-headers zenith
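A minimal sketch of how these targets are usually driven, assuming the vendor/postgres submodule is checked out; the target names come from the Makefile above, and the -j value is an arbitrary choice:

```sh
git submodule update --init --depth 1
make -j8 postgres     # configure, build and install Postgres plus contrib/zenith into ./tmp_install
make zenith           # install the Postgres headers, then `cargo build` the Rust workspace
make distclean        # remove tmp_install and all cargo build artifacts
```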
85 README.md
@@ -2,12 +2,88 @@

Zenith substitutes PostgreSQL storage layer and redistributes data across a cluster of nodes

## Running local installation

1. Build zenith and patched postgres
```sh
git clone --recursive https://github.com/libzenith/zenith.git
cd zenith
make -j5
```

2. Start pageserver and postgres on top of it (should be called from repo root):
```sh
# Create repository in .zenith with proper paths to binaries and data
# Later that would be responsibility of a package install script
> ./target/debug/zenith init
<...>
new zenith repository was created in .zenith

# start pageserver
> ./target/debug/zenith start
Starting pageserver at '127.0.0.1:64000' in .zenith
Pageserver started

# start postgres on top on the pageserver
> ./target/debug/zenith pg start main
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
waiting for server to start.... done

# check list of running postgres instances
> ./target/debug/zenith pg list
BRANCH  ADDRESS          LSN        STATUS
main    127.0.0.1:55432  0/1609610  running
```

3. Now it is possible to connect to postgres and run some queries:
```sh
> psql -p55432 -h 127.0.0.1 postgres
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
INSERT 0 1
postgres=# select * from t;
 key | value
-----+-------
   1 | 1
(1 row)
```

4. And create branches and run postgres on them:
```sh
# create branch named migration_check
> ./target/debug/zenith branch migration_check main
Created branch 'migration_check' at 0/1609610

# check branches tree
> ./target/debug/zenith branch
main
 ┗━ @0/1609610: migration_check

# start postgres on that branch
> ./target/debug/zenith pg start migration_check
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
waiting for server to start.... done

# this new postgres instance will have all the data from 'main' postgres,
# but all modifications would not affect data in original postgres
> psql -p55433 -h 127.0.0.1 postgres
postgres=# select * from t;
 key | value
-----+-------
   1 | 1
(1 row)

postgres=# insert into t values(2,2);
INSERT 0 1
```

## Running tests

```sh
git clone --recursive https://github.com/libzenith/zenith.git
./pgbuild.sh # builds postgres and installs it to ./tmp_install
cargo test -- --test-threads=1
make # builds also postgres and installs it to ./tmp_install
pytest
```

## Source tree layout

@@ -26,11 +102,6 @@ Depends on the modified 'postgres' binary for WAL redo.

Tests with different combinations of a Postgres compute node, WAL safekeeper and Page Server.

/mgmt-console:

Web UI to launch (modified) Postgres servers, using S3 as the backing store. Written in Python.
This is somewhat outdated, as it doesn't use the WAL safekeeper or Page Servers.

/vendor/postgres:

PostgreSQL source tree, with the modifications needed for Zenith.
188 cli-v2-story.md (new file)
@@ -0,0 +1,188 @@
|
||||
Create a new Zenith repository in the current directory:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
|
||||
The files belonging to this database system will be owned by user "heikki".
|
||||
This user must also own the server process.
|
||||
|
||||
The database cluster will be initialized with locale "en_GB.UTF-8".
|
||||
The default database encoding has accordingly been set to "UTF8".
|
||||
The default text search configuration will be set to "english".
|
||||
|
||||
Data page checksums are disabled.
|
||||
|
||||
creating directory tmp ... ok
|
||||
creating subdirectories ... ok
|
||||
selecting dynamic shared memory implementation ... posix
|
||||
selecting default max_connections ... 100
|
||||
selecting default shared_buffers ... 128MB
|
||||
selecting default time zone ... Europe/Helsinki
|
||||
creating configuration files ... ok
|
||||
running bootstrap script ... ok
|
||||
performing post-bootstrap initialization ... ok
|
||||
syncing data to disk ... ok
|
||||
|
||||
initdb: warning: enabling "trust" authentication for local connections
|
||||
You can change this by editing pg_hba.conf or using the option -A, or
|
||||
--auth-local and --auth-host, the next time you run initdb.
|
||||
new zenith repository was created in .zenith
|
||||
|
||||
Initially, there is only one branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
|
||||
main
|
||||
|
||||
Start a local Postgres instance on the branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432
|
||||
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432
|
||||
2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432"
|
||||
2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status"
|
||||
2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0
|
||||
2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required
|
||||
2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
|
||||
Run some commands against it:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);"
|
||||
CREATE TABLE
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');"
|
||||
INSERT 0 1
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
Create a new branch called 'experimental'. We create it from the
|
||||
current end of the 'main' branch, but you could specify a different
|
||||
LSN as the start point instead.
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
|
||||
branching at end of WAL: 0/161F478
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
|
||||
experimental
|
||||
main
|
||||
|
||||
Start another Postgres instance off the 'experimental' branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433
|
||||
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433
|
||||
2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
|
||||
2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80
|
||||
2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0
|
||||
2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
|
||||
2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
|
||||
Insert some a row on the 'experimental' branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')"
|
||||
INSERT 0 1
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
inserted on experimental
|
||||
(2 rows)
|
||||
|
||||
See that the other Postgres instance is still running on 'main' branch on port 5432:
|
||||
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
|
||||
|
||||
|
||||
Everything is stored in the .zenith directory:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
|
||||
total 12
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines
|
||||
|
||||
The 'datadirs' directory contains the datadirs of the running instances:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
|
||||
total 8
|
||||
drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
|
||||
drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
|
||||
total 124
|
||||
drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem
|
||||
-rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf
|
||||
-rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase
|
||||
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION
|
||||
lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact
|
||||
-rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf
|
||||
-rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
|
||||
-rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts
|
||||
-rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid
|
||||
|
||||
Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
|
||||
datadir is ephemeral, you can delete it at any time, and it can be reconstructed
|
||||
from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
|
||||
the repository, the 'datadirs' are not included. (They are like git working trees)
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
|
||||
~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433
|
||||
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433
|
||||
2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
|
||||
2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80
|
||||
2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0
|
||||
2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
|
||||
2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
inserted on experimental
|
||||
(2 rows)
|
||||
|
||||
1 control_plane/.gitignore (vendored, new file)
@@ -0,0 +1 @@
tmp_check/
30 control_plane/Cargo.toml (new file)
@@ -0,0 +1,30 @@
[package]
name = "control_plane"
version = "0.1.0"
authors = ["Stas Kelvich <stas@zenith.tech>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
rand = "0.8.3"
tar = "0.4.33"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
toml = "0.5"
lazy_static = "1.4"
regex = "1"
anyhow = "1.0"
# hex = "0.4.3"
bytes = "1.0.1"
# fs_extra = "1.2.0"
nix = "0.20"
# thiserror = "1"
url = "2.2.2"

pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
postgres_ffi = { path = "../postgres_ffi" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }
467 control_plane/src/compute.rs (new file)
@@ -0,0 +1,467 @@
|
||||
use std::io::Write;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
use std::{
|
||||
fs::{self, OpenOptions},
|
||||
io::Read,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
use crate::storage::PageServerNode;
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
//
|
||||
pub struct ComputeControlPlane {
|
||||
base_port: u16,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
pub nodes: BTreeMap<String, Arc<PostgresNode>>,
|
||||
env: LocalEnv,
|
||||
}
|
||||
|
||||
impl ComputeControlPlane {
|
||||
// Load current nodes with ports from data directories on disk
|
||||
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
|
||||
// TODO: since pageserver do not have config file yet we believe here that
|
||||
// it is running on default port. Change that when pageserver will have config.
|
||||
let pageserver = Arc::new(PageServerNode::from_env(&env));
|
||||
|
||||
let pgdatadirspath = &env.pg_data_dirs_path();
|
||||
let nodes: Result<BTreeMap<_, _>> = fs::read_dir(&pgdatadirspath)
|
||||
.with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
|
||||
.into_iter()
|
||||
.map(|f| {
|
||||
PostgresNode::from_dir_entry(f?, &env, &pageserver)
|
||||
.map(|node| (node.name.clone(), Arc::new(node)))
|
||||
})
|
||||
.collect();
|
||||
let nodes = nodes?;
|
||||
|
||||
Ok(ComputeControlPlane {
|
||||
base_port: 55431,
|
||||
pageserver,
|
||||
nodes,
|
||||
env,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_port(&mut self) -> u16 {
|
||||
1 + self
|
||||
.nodes
|
||||
.iter()
|
||||
.map(|(_name, node)| node.address.port())
|
||||
.max()
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
|
||||
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
base_port: 65431,
|
||||
pageserver: Arc::clone(pageserver),
|
||||
nodes: BTreeMap::new(),
|
||||
env: local_env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Connect to a page server, get base backup, and untar it to initialize a
|
||||
/// new data directory
|
||||
pub fn new_from_page_server(
|
||||
&mut self,
|
||||
is_test: bool,
|
||||
timelineid: ZTimelineId,
|
||||
name: &str,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
let node = Arc::new(PostgresNode {
|
||||
name: name.to_owned(),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
is_test,
|
||||
timelineid,
|
||||
});
|
||||
|
||||
node.init_from_page_server()?;
|
||||
self.nodes.insert(node.name.clone(), Arc::clone(&node));
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
pub fn new_test_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
.branch_get_by_name(branch_name)
|
||||
.expect("failed to get timeline_id")
|
||||
.timeline_id;
|
||||
|
||||
let node = self.new_from_page_server(true, timeline_id, branch_name);
|
||||
let node = node.unwrap();
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"shared_preload_libraries = zenith\n\
|
||||
zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
node.connstr()
|
||||
)
|
||||
.as_str(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_test_master_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
.branch_get_by_name(branch_name)
|
||||
.expect("failed to get timeline_id")
|
||||
.timeline_id;
|
||||
|
||||
let node = self
|
||||
.new_from_page_server(true, timeline_id, branch_name)
|
||||
.unwrap();
|
||||
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
"synchronous_standby_names = 'safekeeper_proxy'\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_node(&mut self, branch_name: &str) -> Result<Arc<PostgresNode>> {
|
||||
let timeline_id = self.pageserver.branch_get_by_name(branch_name)?.timeline_id;
|
||||
|
||||
let node = self.new_from_page_server(false, timeline_id, branch_name)?;
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"shared_preload_libraries = zenith\n\
|
||||
zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
node.connstr()
|
||||
)
|
||||
.as_str(),
|
||||
)?;
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct PostgresNode {
|
||||
pub address: SocketAddr,
|
||||
name: String,
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
pub timelineid: ZTimelineId,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
fn from_dir_entry(
|
||||
entry: std::fs::DirEntry,
|
||||
env: &LocalEnv,
|
||||
pageserver: &Arc<PageServerNode>,
|
||||
) -> Result<PostgresNode> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
anyhow::bail!(
|
||||
"PostgresNode::from_dir_entry failed: '{}' is not a directory",
|
||||
entry.path().display()
|
||||
);
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
|
||||
static ref CONF_TIMELINE_RE: Regex =
|
||||
Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
|
||||
}
|
||||
|
||||
// parse data directory name
|
||||
let fname = entry.file_name();
|
||||
let name = fname.to_str().unwrap().to_string();
|
||||
|
||||
// find out tcp port in config file
|
||||
let cfg_path = entry.path().join("postgresql.conf");
|
||||
let config = fs::read_to_string(cfg_path.clone()).with_context(|| {
|
||||
format!(
|
||||
"failed to read config file in {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
)
|
||||
})?;
|
||||
|
||||
// parse port
|
||||
let err_msg = format!(
|
||||
"failed to find port definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let port: u16 = CONF_PORT_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// parse timeline
|
||||
let err_msg = format!(
|
||||
"failed to find timeline definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let timelineid: ZTimelineId = CONF_TIMELINE_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// ok now
|
||||
Ok(PostgresNode {
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
name,
|
||||
env: env.clone(),
|
||||
pageserver: Arc::clone(pageserver),
|
||||
is_test: false,
|
||||
timelineid,
|
||||
})
|
||||
}
|
||||
|
||||
// Connect to a page server, get base backup, and untar it to initialize a
|
||||
// new data directory
|
||||
pub fn init_from_page_server(&self) -> Result<()> {
|
||||
let pgdata = self.pgdata();
|
||||
println!(
|
||||
"Extracting base backup to create postgres instance: path={} port={}",
|
||||
pgdata.display(),
|
||||
self.address.port()
|
||||
);
|
||||
|
||||
// initialize data directory
|
||||
if self.is_test {
|
||||
fs::remove_dir_all(&pgdata).ok();
|
||||
}
|
||||
|
||||
let sql = format!("basebackup {}", self.timelineid);
|
||||
let mut client = self
|
||||
.pageserver
|
||||
.page_server_psql_client()
|
||||
.with_context(|| "connecting to page server failed")?;
|
||||
|
||||
fs::create_dir_all(&pgdata)
|
||||
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
|
||||
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"could not set permissions in data directory {}",
|
||||
pgdata.display()
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
// FIXME: The compute node should be able to stream the WAL it needs from the WAL safekeepers or archive.
|
||||
// But that's not implemented yet. For now, 'pg_wal' is included in the base backup tarball that
|
||||
// we receive from the Page Server, so we don't need to create the empty 'pg_wal' directory here.
|
||||
//fs::create_dir_all(pgdata.join("pg_wal"))?;
|
||||
|
||||
let mut copyreader = client
|
||||
.copy_out(sql.as_str())
|
||||
.with_context(|| "page server 'basebackup' command failed")?;
|
||||
|
||||
// FIXME: Currently, we slurp the whole tarball into memory, and then extract it,
|
||||
// but we really should do this:
|
||||
//let mut ar = tar::Archive::new(copyreader);
|
||||
let mut buf = vec![];
|
||||
copyreader
|
||||
.read_to_end(&mut buf)
|
||||
.with_context(|| "reading base backup from page server failed")?;
|
||||
let mut ar = tar::Archive::new(buf.as_slice());
|
||||
ar.unpack(&pgdata)
|
||||
.with_context(|| "extracting page backup failed")?;
|
||||
|
||||
// listen for selected port
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
fsync = off\n\
|
||||
max_connections = 100\n\
|
||||
wal_sender_timeout = 0\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n",
|
||||
address = self.address.ip(),
|
||||
port = self.address.port()
|
||||
),
|
||||
)?;
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeepr or
|
||||
// page server yet. But this will do for now.
|
||||
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;
|
||||
|
||||
// Connect it to the page server.
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"shared_preload_libraries = zenith \n\
|
||||
zenith.page_server_connstring = 'host={} port={}'\n\
|
||||
zenith.zenith_timeline='{}'\n",
|
||||
self.pageserver.address().ip(),
|
||||
self.pageserver.address().port(),
|
||||
self.timelineid
|
||||
),
|
||||
)?;
|
||||
|
||||
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
|
||||
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
|
||||
self.pg_resetwal(&["-f"])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn pgdata(&self) -> PathBuf {
|
||||
self.env.pg_data_dir(&self.name)
|
||||
}
|
||||
|
||||
pub fn status(&self) -> &str {
|
||||
let timeout = Duration::from_millis(300);
|
||||
let has_pidfile = self.pgdata().join("postmaster.pid").exists();
|
||||
let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
|
||||
|
||||
match (has_pidfile, can_connect) {
|
||||
(true, true) => "running",
|
||||
(false, false) => "stopped",
|
||||
(true, false) => "crashed",
|
||||
(false, true) => "running, no pidfile",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata().join(config).to_str().unwrap())?
|
||||
.write_all(opts.as_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str]) -> Result<()> {
|
||||
let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl");
|
||||
|
||||
let pg_ctl = Command::new(pg_ctl_path)
|
||||
.args(
|
||||
[
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata().to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata().join("log").to_str().unwrap(),
|
||||
],
|
||||
args,
|
||||
]
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.with_context(|| "pg_ctl failed")?;
|
||||
if !pg_ctl.success() {
|
||||
anyhow::bail!("pg_ctl failed");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pg_resetwal(&self, args: &[&str]) -> Result<()> {
|
||||
let pg_resetwal_path = self.env.pg_bin_dir().join("pg_resetwal");
|
||||
|
||||
let pg_ctl = Command::new(pg_resetwal_path)
|
||||
.args([&["-D", self.pgdata().to_str().unwrap()], args].concat())
|
||||
.status()
|
||||
.with_context(|| "pg_resetwal failed")?;
|
||||
if !pg_ctl.success() {
|
||||
anyhow::bail!("pg_resetwal failed");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"])
|
||||
}
|
||||
|
||||
pub fn restart(&self) -> Result<()> {
|
||||
self.pg_ctl(&["restart"])
|
||||
}
|
||||
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"])?;
|
||||
if destroy {
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
self.pgdata().to_str().unwrap()
|
||||
);
|
||||
fs::remove_dir_all(&self.pgdata())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!(
|
||||
"host={} port={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
self.whoami()
|
||||
)
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
if self.is_test {
|
||||
let _ = self.stop(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
31 control_plane/src/lib.rs (new file)
@@ -0,0 +1,31 @@
//
// Local control plane.
//
// Can start, cofigure and stop postgres instances running as a local processes.
//
// Intended to be used in integration tests and in CLI tools for
// local installations.
//
use anyhow::{anyhow, bail, Context, Result};
use std::fs;
use std::path::Path;

pub mod compute;
pub mod local_env;
pub mod storage;

/// Read a PID file
///
/// We expect a file that contains a single integer.
/// We return an i32 for compatibility with libc and nix.
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
    let pid_str = fs::read_to_string(pidfile)
        .with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
    let pid: i32 = pid_str
        .parse()
        .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
    if pid < 1 {
        bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
    }
    Ok(pid)
}
166
control_plane/src/local_env.rs
Normal file
166
control_plane/src/local_env.rs
Normal file
@@ -0,0 +1,166 @@
|
||||
//
|
||||
// This module is responsible for locating and loading paths in a local setup.
|
||||
//
|
||||
// Now it also provides init method which acts like a stub for proper installation
|
||||
// script which will use local paths.
|
||||
//
|
||||
use anyhow::{anyhow, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::{collections::BTreeMap, env};
|
||||
use url::Url;
|
||||
|
||||
pub type Remotes = BTreeMap<String, String>;
|
||||
|
||||
//
|
||||
// This data structures represent deserialized zenith CLI config
|
||||
//
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct LocalEnv {
|
||||
// Pageserver connection strings
|
||||
pub pageserver_connstring: String,
|
||||
|
||||
// Base directory for both pageserver and compute nodes
|
||||
pub base_data_dir: PathBuf,
|
||||
|
||||
// Path to postgres distribution. It's expected that "bin", "include",
|
||||
// "lib", "share" from postgres distribution are there. If at some point
|
||||
// in time we will be able to run against vanilla postgres we may split that
|
||||
// to four separate paths and match OS-specific installation layout.
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Path to pageserver binary. Empty for remote pageserver.
|
||||
pub zenith_distrib_dir: Option<PathBuf>,
|
||||
|
||||
pub remotes: Remotes,
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
// postgres installation paths
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> Result<PathBuf> {
|
||||
Ok(self
|
||||
.zenith_distrib_dir
|
||||
.as_ref()
|
||||
.ok_or(anyhow!("Can not manage remote pageserver"))?
|
||||
.join("pageserver"))
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
self.base_data_dir.join("pgdatadirs")
|
||||
}
|
||||
|
||||
pub fn pg_data_dir(&self, name: &str) -> PathBuf {
|
||||
self.pg_data_dirs_path().join(name)
|
||||
}
|
||||
|
||||
// TODO: move pageserver files into ./pageserver
|
||||
pub fn pageserver_data_dir(&self) -> PathBuf {
|
||||
self.base_data_dir.clone()
|
||||
}
|
||||
}
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init(remote_pageserver: Option<&str>) -> Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = base_path();
|
||||
if base_path.exists() {
|
||||
anyhow::bail!(
|
||||
"{} already exists. Perhaps already initialized?",
|
||||
base_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// ok, now check that expected binaries are present
|
||||
|
||||
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
|
||||
let pg_distrib_dir: PathBuf = {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
postgres_bin.into()
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
cwd.join("tmp_install")
|
||||
}
|
||||
};
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
fs::create_dir(&base_path)?;
|
||||
fs::create_dir(base_path.join("pgdatadirs"))?;
|
||||
|
||||
let conf = if let Some(addr) = remote_pageserver {
|
||||
// check that addr is parsable
|
||||
let _uri = Url::parse(addr).map_err(|e| anyhow!("{}: {}", addr, e))?;
|
||||
|
||||
LocalEnv {
|
||||
pageserver_connstring: format!("postgresql://{}/", addr),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir: None,
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
}
|
||||
} else {
|
||||
// Find zenith binaries.
|
||||
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
if !zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.",);
|
||||
}
|
||||
|
||||
LocalEnv {
|
||||
pageserver_connstring: "postgresql://127.0.0.1:6400".to_string(),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir: Some(zenith_distrib_dir),
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
}
|
||||
};
|
||||
|
||||
let toml = toml::to_string_pretty(&conf)?;
|
||||
fs::write(conf.base_data_dir.join("config"), toml)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Locate and load config
|
||||
pub fn load_config() -> Result<LocalEnv> {
|
||||
let repopath = base_path();
|
||||
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: check that it looks like a zenith repository
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
toml::from_str(config.as_str()).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
// Save config. We use this to change the set of remotes from the CLI itself.
|
||||
pub fn save_config(conf: &LocalEnv) -> Result<()> {
|
||||
let config_path = base_path().join("config");
|
||||
let conf_str = toml::to_string_pretty(conf)?;
|
||||
|
||||
fs::write(config_path, conf_str)?;
|
||||
Ok(())
|
||||
}
|
||||
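As a usage sketch (not part of the diff), a CLI subcommand could pair load_config and save_config to register a new remote; the remote name, URL, and the add_remote helper below are made up for illustration.

use anyhow::Result;
use control_plane::local_env;

// Hypothetical subcommand body: remember a named remote in the repo config.
fn add_remote(name: &str, url: &str) -> Result<()> {
    let mut env = local_env::load_config()?;
    env.remotes.insert(name.to_string(), url.to_string());
    local_env::save_config(&env)?;
    Ok(())
}

fn main() -> Result<()> {
    add_remote("staging", "postgresql://pageserver.example.com:6400")
}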
control_plane/src/storage.rs (new file, 252 lines)
@@ -0,0 +1,252 @@
|
||||
use std::collections::HashMap;
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::read_pidfile;
|
||||
use pageserver::branches::BranchInfo;
|
||||
|
||||
//
|
||||
// Control routines for pageserver.
|
||||
//
|
||||
// Used in CLI and tests.
|
||||
//
|
||||
pub struct PageServerNode {
|
||||
pub kill_on_exit: bool,
|
||||
pub listen_address: Option<SocketAddr>,
|
||||
pub env: LocalEnv,
|
||||
}
|
||||
|
||||
impl PageServerNode {
|
||||
pub fn from_env(env: &LocalEnv) -> PageServerNode {
|
||||
PageServerNode {
|
||||
kill_on_exit: false,
|
||||
listen_address: None, // default
|
||||
env: env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn address(&self) -> SocketAddr {
|
||||
match self.listen_address {
|
||||
Some(addr) => addr,
|
||||
None => "127.0.0.1:64000".parse().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init(&self) -> Result<()> {
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let status = cmd
|
||||
.args(&["--init", "-D", self.env.base_data_dir.to_str().unwrap()])
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.env(
|
||||
"POSTGRES_DISTRIB_DIR",
|
||||
self.env.pg_distrib_dir.to_str().unwrap(),
|
||||
)
|
||||
.env("ZENITH_REPO_DIR", self.repo_path())
|
||||
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pageserver init failed");
|
||||
|
||||
if status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!("pageserver init failed"))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
self.env.pageserver_data_dir()
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
self.repo_path().join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
println!(
|
||||
"Starting pageserver at '{}' in {}",
|
||||
self.address(),
|
||||
self.repo_path().display()
|
||||
);
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(&[
|
||||
"-l",
|
||||
self.address().to_string().as_str(),
|
||||
"-D",
|
||||
self.repo_path().to_str().unwrap(),
|
||||
])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.env(
|
||||
"POSTGRES_DISTRIB_DIR",
|
||||
self.env.pg_distrib_dir.to_str().unwrap(),
|
||||
)
|
||||
.env("ZENITH_REPO_DIR", self.repo_path())
|
||||
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the page server to start up. Wait until it is
|
||||
// open for business.
|
||||
for retries in 1..15 {
|
||||
let client = self.page_server_psql_client();
|
||||
if client.is_ok() {
|
||||
break;
|
||||
} else {
|
||||
println!("Pageserver not responding yet, retrying ({})...", retries);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
|
||||
println!("Pageserver started");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
let pid = read_pidfile(&self.pid_file())?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
|
||||
// wait for the pageserver to stop
|
||||
for _ in 0..5 {
|
||||
let stream = TcpStream::connect(self.address());
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
if let Err(_e) = stream {
|
||||
println!("Pageserver stopped");
|
||||
return Ok(());
|
||||
}
|
||||
println!("Stopping pageserver on {}", self.address());
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address().ip(),
|
||||
self.address().port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{}'", sql);
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address().ip(),
|
||||
self.address().port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls)
|
||||
}
|
||||
|
||||
pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let query_result = client.simple_query("branch_list")?;
|
||||
let branches_json = query_result
|
||||
.first()
|
||||
.map(|msg| match msg {
|
||||
postgres::SimpleQueryMessage::Row(row) => row.get(0),
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.ok_or_else(|| anyhow!("missing branches"))?;
|
||||
|
||||
let res: Vec<BranchInfo> = serde_json::from_str(branches_json)?;
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn branch_create(&self, name: &str, startpoint: &str) -> Result<BranchInfo> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let query_result =
|
||||
client.simple_query(format!("branch_create {} {}", name, startpoint).as_str())?;
|
||||
|
||||
let branch_json = query_result
|
||||
.first()
|
||||
.map(|msg| match msg {
|
||||
postgres::SimpleQueryMessage::Row(row) => row.get(0),
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.ok_or_else(|| anyhow!("missing branch"))?;
|
||||
|
||||
let res: BranchInfo = serde_json::from_str(branch_json).map_err(|e| {
|
||||
anyhow!(
|
||||
"failed to parse branch_create response: {}: {}",
|
||||
branch_json,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
// TODO: make this a separate request type and avoid loading all the branches
|
||||
pub fn branch_get_by_name(&self, name: &str) -> Result<BranchInfo> {
|
||||
let branch_infos = self.branches_list()?;
|
||||
let branche_by_name: Result<HashMap<String, BranchInfo>> = branch_infos
|
||||
.into_iter()
|
||||
.map(|branch_info| Ok((branch_info.name.clone(), branch_info)))
|
||||
.collect();
|
||||
let branche_by_name = branche_by_name?;
|
||||
|
||||
let branch = branche_by_name
|
||||
.get(name)
|
||||
.ok_or_else(|| anyhow!("Branch {} not found", name))?;
|
||||
|
||||
Ok(branch.clone())
|
||||
}
|
||||
|
||||
pub fn system_id_get(&self) -> Result<u64> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let query_result = client
|
||||
.simple_query("identify_system")?
|
||||
.first()
|
||||
.map(|msg| match msg {
|
||||
postgres::SimpleQueryMessage::Row(row) => row.get(0),
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.ok_or_else(|| anyhow!("failed to get system_id"))?
|
||||
.parse::<u64>()?;
|
||||
|
||||
Ok(query_result)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
if self.kill_on_exit {
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
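A rough sketch of driving PageServerNode end to end with only the methods defined above; the branch names are invented and error handling is simplified, so treat it as an illustration rather than the CLI's actual code.

use anyhow::Result;
use control_plane::local_env;
use control_plane::storage::PageServerNode;

fn main() -> Result<()> {
    let env = local_env::load_config()?;
    let pageserver = PageServerNode::from_env(&env);

    // First run: create the repository, then start the daemon.
    pageserver.init()?;
    pageserver.start()?;

    // Create a branch and list what the pageserver knows about.
    let branch = pageserver.branch_create("experiment", "main")?;
    println!("created branch {}", branch.name);
    for b in pageserver.branches_list()? {
        println!("branch: {}", b.name);
    }

    pageserver.stop()
}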
integration_tests/.gitignore (new vendored file, 1 line)
@@ -0,0 +1 @@
|
||||
tmp_check/
|
||||
@@ -9,8 +9,10 @@ edition = "2018"
|
||||
[dependencies]
|
||||
lazy_static = "1.4.0"
|
||||
rand = "0.8.3"
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
anyhow = "1.0"
|
||||
nix = "0.20"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
|
||||
integration_tests/src/lib.rs (new file, 416 lines)
@@ -0,0 +1,416 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::Read;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, ExitStatus};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres;
|
||||
|
||||
use control_plane::compute::PostgresNode;
|
||||
use control_plane::read_pidfile;
|
||||
use control_plane::{local_env::LocalEnv, storage::PageServerNode};
|
||||
|
||||
// Find the directory where the binaries were put (i.e. target/debug/)
|
||||
fn cargo_bin_dir() -> PathBuf {
|
||||
let mut pathbuf = std::env::current_exe().unwrap();
|
||||
|
||||
pathbuf.pop();
|
||||
if pathbuf.ends_with("deps") {
|
||||
pathbuf.pop();
|
||||
}
|
||||
|
||||
pathbuf
|
||||
}
|
||||
|
||||
// local compute env for tests
|
||||
pub fn create_test_env(testname: &str) -> LocalEnv {
|
||||
let base_path = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_check/")
|
||||
.join(testname);
|
||||
|
||||
let base_path_str = base_path.to_str().unwrap();
|
||||
|
||||
// Remove remnants of old test repo
|
||||
let _ = fs::remove_dir_all(&base_path);
|
||||
|
||||
fs::create_dir_all(&base_path)
|
||||
.expect(format!("could not create directory for {}", base_path_str).as_str());
|
||||
|
||||
let pgdatadirs_path = base_path.join("pgdatadirs");
|
||||
fs::create_dir(&pgdatadirs_path)
|
||||
.expect(format!("could not create directory {:?}", pgdatadirs_path).as_str());
|
||||
|
||||
LocalEnv {
|
||||
pageserver_connstring: "postgresql://127.0.0.1:64000".to_string(),
|
||||
pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
|
||||
zenith_distrib_dir: Some(cargo_bin_dir()),
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Collection of several example deployments useful for tests.
|
||||
//
|
||||
// I'm intentionally modelling storage and compute control planes as separate entities
|
||||
// as it is closer to the actual setup.
|
||||
//
|
||||
pub struct TestStorageControlPlane {
|
||||
pub wal_acceptors: Vec<WalAcceptorNode>,
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
pub test_done: AtomicBool,
|
||||
}
|
||||
|
||||
impl TestStorageControlPlane {
|
||||
// postgres <-> page_server
|
||||
//
|
||||
// Initialize a new repository and configure a page server to run in it
|
||||
//
|
||||
pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
|
||||
let pserver = Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
});
|
||||
pserver.init().unwrap();
|
||||
pserver.start().unwrap();
|
||||
|
||||
TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
// postgres <-> {wal_acceptor1, wal_acceptor2, ...}
|
||||
pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
|
||||
let mut cplane = TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
}),
|
||||
test_done: AtomicBool::new(false),
|
||||
// repopath,
|
||||
};
|
||||
cplane.pageserver.init().unwrap();
|
||||
cplane.pageserver.start().unwrap();
|
||||
|
||||
let systemid = cplane.pageserver.system_id_get().unwrap();
|
||||
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
let datadir_base = local_env.base_data_dir.join("safekeepers");
|
||||
fs::create_dir_all(&datadir_base).unwrap();
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: datadir_base.join(format!("wal_acceptor_{}", i)),
|
||||
systemid,
|
||||
env: local_env.clone(),
|
||||
pass_to_pageserver: true,
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
let _ = wa.stop();
|
||||
}
|
||||
self.test_done.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn is_running(&self) -> bool {
|
||||
self.test_done.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TestStorageControlPlane {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
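To show how these pieces are meant to fit together in a test, here is a hedged sketch; the test name, the redundancy value of 3, and the #[ignore] marker are assumptions, and it only exercises APIs defined in this file.

use control_plane::local_env::LocalEnv;
use integration_tests::{create_test_env, TestStorageControlPlane};

#[test]
#[ignore] // needs the zenith binaries and tmp_install to be built first
fn three_safekeepers_smoke() {
    let local_env: LocalEnv = create_test_env("three_safekeepers_smoke");
    let cplane = TestStorageControlPlane::fault_tolerant(&local_env, 3);

    // A compute node would be pointed at this comma-separated listen list.
    let wal_acceptors = cplane.get_wal_acceptor_conn_info();
    assert_eq!(wal_acceptors.split(',').count(), 3);

    cplane.stop();
}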
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// PostgresNodeExt
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
///
|
||||
/// Testing utilities for PostgresNode type
|
||||
///
|
||||
pub trait PostgresNodeExt {
|
||||
fn pg_regress(&self) -> ExitStatus;
|
||||
fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus;
|
||||
fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode;
|
||||
fn open_psql(&self, db: &str) -> postgres::Client;
|
||||
fn dump_log_file(&self);
|
||||
fn safe_psql(&self, db: &str, sql: &str) -> Vec<postgres::Row>;
|
||||
}
|
||||
|
||||
impl PostgresNodeExt for PostgresNode {
|
||||
fn pg_regress(&self) -> ExitStatus {
|
||||
self.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
|
||||
let regress_run_path = self.env.base_data_dir.join("regress");
|
||||
fs::create_dir_all(®ress_run_path).unwrap();
|
||||
fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGPORT", self.address.port().to_string())
|
||||
.env("PGUSER", self.whoami())
|
||||
.env("PGHOST", self.address.ip().to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
if !regress_check.success() {
|
||||
if let Ok(mut file) = File::open("regression.diffs") {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- regression.diffs:\n{}", buffer);
|
||||
}
|
||||
self.dump_log_file();
|
||||
}
|
||||
regress_check
|
||||
}
|
||||
|
||||
fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus {
|
||||
let port = self.address.port().to_string();
|
||||
let clients = clients.to_string();
|
||||
let seconds = seconds.to_string();
|
||||
let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&["-i", "-p", port.as_str(), "postgres"])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench -i");
|
||||
let pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&[
|
||||
"-p",
|
||||
port.as_str(),
|
||||
"-T",
|
||||
seconds.as_str(),
|
||||
"-P",
|
||||
"1",
|
||||
"-c",
|
||||
clients.as_str(),
|
||||
"-M",
|
||||
"prepared",
|
||||
"postgres",
|
||||
])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench run");
|
||||
pg_bench_run
|
||||
}
|
||||
|
||||
fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
|
||||
let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["--ztimelineid", &self.timelineid.to_string()])
|
||||
.args(&["-s", wal_acceptors])
|
||||
.args(&["-h", &self.address.ip().to_string()])
|
||||
.args(&["-p", &self.address.port().to_string()])
|
||||
.arg("-v")
|
||||
.stderr(
|
||||
OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(self.pgdata().join("safekeeper_proxy.log"))
|
||||
.unwrap(),
|
||||
)
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
fn dump_log_file(&self) {
|
||||
if let Ok(mut file) = File::open(self.env.pageserver_data_dir().join("pageserver.log")) {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- pageserver.log:\n{}", buffer);
|
||||
}
|
||||
}
|
||||
|
||||
fn safe_psql(&self, db: &str, sql: &str) -> Vec<postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = postgres::Client::connect(connstring.as_str(), postgres::NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
let result = client.query(sql, &[]);
|
||||
if result.is_err() {
|
||||
self.dump_log_file();
|
||||
}
|
||||
result.unwrap()
|
||||
}
|
||||
|
||||
fn open_psql(&self, db: &str) -> postgres::Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
postgres::Client::connect(connstring.as_str(), postgres::NoTls).unwrap()
|
||||
}
|
||||
}
|
||||
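A short sketch of how a test might use the PostgresNodeExt trait; obtaining the PostgresNode itself (via the compute control plane) is assumed and not shown, and the pgbench parameters are arbitrary.

use control_plane::compute::PostgresNode;
use integration_tests::PostgresNodeExt;

fn check_node(node: &PostgresNode) {
    // Run the full regression schedule against the running node.
    let regress = node.pg_regress();
    assert!(regress.success(), "pg_regress reported failures");

    // A short pgbench run as a smoke test: 4 clients for 10 seconds.
    let bench = node.pg_bench(4, 10);
    assert!(bench.success(), "pgbench run failed");
}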
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// WalAcceptorNode
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Control routines for WalAcceptor.
|
||||
//
|
||||
// Now used only in test setups.
|
||||
//
|
||||
pub struct WalAcceptorNode {
|
||||
listen: SocketAddr,
|
||||
data_dir: PathBuf,
|
||||
systemid: u64,
|
||||
env: LocalEnv,
|
||||
pass_to_pageserver: bool,
|
||||
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let ps_arg = if self.pass_to_pageserver {
|
||||
// Tell page server it can receive WAL from this WAL safekeeper
|
||||
["--pageserver", "127.0.0.1:64000"].to_vec()
|
||||
} else {
|
||||
[].to_vec()
|
||||
};
|
||||
|
||||
let status = Command::new(
|
||||
self.env
|
||||
.zenith_distrib_dir
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.join("wal_acceptor"),
|
||||
)
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.args(&["--systemid", self.systemid.to_string().as_str()])
|
||||
.args(&ps_arg)
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
println!("Stopping wal acceptor on {}", self.listen);
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
let pid = read_pidfile(&pidfile)?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to kill wal_acceptor with pid {}", pid);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
|
||||
fn drop(&mut self) {
|
||||
// Ignore errors.
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
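For illustration, a helper that could sit next to fault_tolerant() inside this module and bring up a single acceptor by hand (the WalAcceptorNode fields are private to the crate, so it cannot be built from outside); the port, directory name, and pass_to_pageserver value are assumptions.

// Hypothetical module-internal helper; LocalEnv and WalAcceptorNode are
// already in scope in this file.
fn start_one_acceptor(env: &LocalEnv, systemid: u64) -> WalAcceptorNode {
    let wal_acceptor = WalAcceptorNode {
        listen: "127.0.0.1:54321".parse().unwrap(),
        data_dir: env.base_data_dir.join("safekeepers").join("wal_acceptor_0"),
        systemid,
        env: env.clone(),
        pass_to_pageserver: false,
    };
    wal_acceptor.init(); // recreate the data directory
    wal_acceptor.start(); // spawn the wal_acceptor binary as a daemon
    wal_acceptor
}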
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// WalProposerNode
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct WalProposerNode {
|
||||
pub pid: u32,
|
||||
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
// std::process::Child::id() returns u32, we need i32.
|
||||
let pid: i32 = self.pid.try_into().unwrap();
|
||||
let pid = Pid::from_raw(pid);
|
||||
kill(pid, Signal::SIGTERM).expect("failed to execute kill");
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
@@ -1,844 +0,0 @@
|
||||
//
|
||||
// Local control plane.
|
||||
//
|
||||
// Can start, configure and stop postgres instances running as local processes.
|
||||
//
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
|
||||
use std::fs::File;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::{
|
||||
io::Write,
|
||||
net::{IpAddr, Ipv4Addr, SocketAddr},
|
||||
};
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use postgres;
|
||||
|
||||
lazy_static! {
|
||||
// postgres would be there if it was build by 'make postgres' here in the repo
|
||||
pub static ref PG_BIN_DIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_install/bin");
|
||||
pub static ref PG_LIB_DIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_install/lib");
|
||||
|
||||
pub static ref BIN_DIR : PathBuf = cargo_bin_dir();
|
||||
|
||||
pub static ref TEST_WORKDIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tmp_check");
|
||||
}
|
||||
|
||||
// Find the directory where the binaries were put (i.e. target/debug/)
|
||||
pub fn cargo_bin_dir() -> PathBuf {
|
||||
let mut pathbuf = std::env::current_exe().ok().unwrap();
|
||||
|
||||
pathbuf.pop();
|
||||
if pathbuf.ends_with("deps") {
|
||||
pathbuf.pop();
|
||||
}
|
||||
|
||||
return pathbuf;
|
||||
}
|
||||
|
||||
//
|
||||
// I'm intentionally modelling storage and compute control planes as separate entities
|
||||
// as it is closer to the actual setup.
|
||||
//
|
||||
pub struct StorageControlPlane {
|
||||
pub wal_acceptors: Vec<WalAcceptorNode>,
|
||||
pub page_servers: Vec<PageServerNode>,
|
||||
}
|
||||
|
||||
impl StorageControlPlane {
|
||||
// postgres <-> page_server
|
||||
pub fn one_page_server(froms3: bool) -> StorageControlPlane {
|
||||
let mut cplane = StorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
page_servers: Vec::new(),
|
||||
};
|
||||
|
||||
let pserver = PageServerNode {
|
||||
page_service_addr: "127.0.0.1:65200".parse().unwrap(),
|
||||
data_dir: TEST_WORKDIR.join("pageserver"),
|
||||
};
|
||||
pserver.init();
|
||||
if froms3 {
|
||||
pserver.start_froms3();
|
||||
} else {
|
||||
pserver.start();
|
||||
}
|
||||
|
||||
cplane.page_servers.push(pserver);
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn fault_tolerant(redundancy: usize) -> StorageControlPlane {
|
||||
let mut cplane = StorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
page_servers: Vec::new(),
|
||||
};
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: TEST_WORKDIR.join(format!("wal_acceptor_{}", i)),
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
wa.stop();
|
||||
}
|
||||
}
|
||||
|
||||
// // postgres <-> wal_acceptor x3 <-> page_server
|
||||
// fn local(&mut self) -> StorageControlPlane {
|
||||
// }
|
||||
|
||||
pub fn page_server_addr(&self) -> &SocketAddr {
|
||||
&self.page_servers[0].page_service_addr
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string().to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let addr = &self.page_servers[0].page_service_addr;
|
||||
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
addr.ip(),
|
||||
addr.port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{}'", sql);
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StorageControlPlane {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageServerNode {
|
||||
page_service_addr: SocketAddr,
|
||||
data_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl PageServerNode {
|
||||
// TODO: method to force redo on a specific relation
|
||||
|
||||
// TODO: make wal-redo-postgres workable without data directory?
|
||||
pub fn init(&self) {
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
|
||||
let datadir_path = self.data_dir.join("wal_redo_pgdata");
|
||||
fs::remove_dir_all(datadir_path.to_str().unwrap()).ok();
|
||||
|
||||
let initdb = Command::new(PG_BIN_DIR.join("initdb"))
|
||||
.args(&["-D", datadir_path.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!("Starting pageserver at '{}'", self.page_service_addr);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("pageserver"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.page_service_addr.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.arg("--skip-recovery")
|
||||
.env_clear()
|
||||
.env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to postres-wal-redo binary
|
||||
.status()
|
||||
.expect("failed to start pageserver");
|
||||
|
||||
if !status.success() {
|
||||
panic!("pageserver start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_froms3(&self) {
|
||||
println!("Starting pageserver at '{}'", self.page_service_addr);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("pageserver"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.page_service_addr.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to postres-wal-redo binary
|
||||
.env("S3_ENDPOINT", "https://127.0.0.1:9000")
|
||||
.env("S3_REGION", "us-east-1")
|
||||
.env("S3_ACCESSKEY", "minioadmin")
|
||||
.env("S3_SECRET", "minioadmin")
|
||||
.status()
|
||||
.expect("failed to start pageserver");
|
||||
|
||||
if !status.success() {
|
||||
panic!("pageserver start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
let pidfile = self.data_dir.join("pageserver.pid");
|
||||
let pid = fs::read_to_string(pidfile).unwrap();
|
||||
let status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WalAcceptorNode {
|
||||
listen: SocketAddr,
|
||||
data_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("wal_acceptor"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
if let Ok(pid) = fs::read_to_string(pidfile) {
|
||||
let _status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
//
|
||||
pub struct ComputeControlPlane<'a> {
|
||||
pg_bin_dir: PathBuf,
|
||||
work_dir: PathBuf,
|
||||
last_assigned_port: u16,
|
||||
storage_cplane: &'a StorageControlPlane,
|
||||
nodes: Vec<Arc<PostgresNode>>,
|
||||
}
|
||||
|
||||
impl ComputeControlPlane<'_> {
|
||||
pub fn local(storage_cplane: &StorageControlPlane) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
pg_bin_dir: PG_BIN_DIR.to_path_buf(),
|
||||
work_dir: TEST_WORKDIR.to_path_buf(),
|
||||
last_assigned_port: 65431,
|
||||
storage_cplane: storage_cplane,
|
||||
nodes: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: check port availability and
|
||||
fn get_port(&mut self) -> u16 {
|
||||
let port = self.last_assigned_port + 1;
|
||||
self.last_assigned_port += 1;
|
||||
port
|
||||
}
|
||||
|
||||
pub fn new_vanilla_node<'a>(&mut self) -> &Arc<PostgresNode> {
|
||||
// allocate new node entry with generated port
|
||||
let node_id = self.nodes.len() + 1;
|
||||
let node = PostgresNode {
|
||||
_node_id: node_id,
|
||||
port: self.get_port(),
|
||||
ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
|
||||
pgdata: self.work_dir.join(format!("compute/pg{}", node_id)),
|
||||
pg_bin_dir: self.pg_bin_dir.clone(),
|
||||
};
|
||||
self.nodes.push(Arc::new(node));
|
||||
let node = self.nodes.last().unwrap();
|
||||
|
||||
// initialize data directory
|
||||
fs::remove_dir_all(node.pgdata.to_str().unwrap()).ok();
|
||||
let initdb_path = self.pg_bin_dir.join("initdb");
|
||||
println!("initdb_path: {}", initdb_path.to_str().unwrap());
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", node.pgdata.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
|
||||
// // allow local replication connections
|
||||
// node.append_conf("pg_hba.conf", format!("\
|
||||
// host replication all {}/32 sspi include_realm=1 map=regress\n\
|
||||
// ", node.ip).as_str());
|
||||
|
||||
// listen for selected port
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n\
|
||||
",
|
||||
address = node.ip,
|
||||
port = node.port
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
// Init compute node without files, only datadir structure
|
||||
// use initdb --compute-node flag and GUC 'computenode_mode'
|
||||
// to distinguish the node
|
||||
pub fn new_minimal_node<'a>(&mut self) -> &Arc<PostgresNode> {
|
||||
// allocate new node entry with generated port
|
||||
let node_id = self.nodes.len() + 1;
|
||||
let node = PostgresNode {
|
||||
_node_id: node_id,
|
||||
port: self.get_port(),
|
||||
ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
|
||||
pgdata: self.work_dir.join(format!("compute/pg{}", node_id)),
|
||||
pg_bin_dir: self.pg_bin_dir.clone(),
|
||||
};
|
||||
self.nodes.push(Arc::new(node));
|
||||
let node = self.nodes.last().unwrap();
|
||||
|
||||
// initialize data directory
|
||||
fs::remove_dir_all(node.pgdata.to_str().unwrap()).ok();
|
||||
let initdb_path = self.pg_bin_dir.join("initdb");
|
||||
println!("initdb_path: {}", initdb_path.to_str().unwrap());
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", node.pgdata.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.arg("--compute-node")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
|
||||
// // allow local replication connections
|
||||
// node.append_conf("pg_hba.conf", format!("\
|
||||
// host replication all {}/32 sspi include_realm=1 map=regress\n\
|
||||
// ", node.ip).as_str());
|
||||
|
||||
// listen for selected port
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n\
|
||||
computenode_mode = true\n\
|
||||
",
|
||||
address = node.ip,
|
||||
port = node.port
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_node_wo_data(&mut self) -> Arc<PostgresNode> {
|
||||
let storage_cplane = self.storage_cplane;
|
||||
let node = self.new_minimal_node();
|
||||
|
||||
let pserver = storage_cplane.page_server_addr();
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
page_server_connstring = 'host={} port={}'\n\
|
||||
",
|
||||
pserver.ip(),
|
||||
pserver.port()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node.clone()
|
||||
}
|
||||
|
||||
pub fn new_node(&mut self) -> Arc<PostgresNode> {
|
||||
let storage_cplane = self.storage_cplane;
|
||||
let node = self.new_vanilla_node();
|
||||
|
||||
let pserver = storage_cplane.page_server_addr();
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
page_server_connstring = 'host={} port={}'\n\
|
||||
",
|
||||
pserver.ip(),
|
||||
pserver.port()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node.clone()
|
||||
}
|
||||
|
||||
pub fn new_master_node(&mut self) -> Arc<PostgresNode> {
|
||||
let node = self.new_vanilla_node();
|
||||
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
"synchronous_standby_names = 'safekeeper_proxy'\n\
|
||||
",
|
||||
);
|
||||
node.clone()
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct WalProposerNode {
|
||||
pid: u32,
|
||||
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
let status = Command::new("kill")
|
||||
.arg(self.pid.to_string())
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct PostgresNode {
|
||||
_node_id: usize,
|
||||
pub port: u16,
|
||||
pub ip: IpAddr,
|
||||
pgdata: PathBuf,
|
||||
pg_bin_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
pub fn append_conf(&self, config: &str, opts: &str) {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata.join(config).to_str().unwrap())
|
||||
.unwrap()
|
||||
.write_all(opts.as_bytes())
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str], check_ok: bool) {
|
||||
let pg_ctl_path = self.pg_bin_dir.join("pg_ctl");
|
||||
let pg_ctl = Command::new(pg_ctl_path)
|
||||
.args(
|
||||
[
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata.join("log").to_str().unwrap(),
|
||||
],
|
||||
args,
|
||||
]
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute pg_ctl");
|
||||
|
||||
if check_ok && !pg_ctl.success() {
|
||||
panic!("pg_ctl failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self, storage_cplane: &StorageControlPlane) {
|
||||
if storage_cplane.page_servers.len() != 0 {
|
||||
let _res =
|
||||
storage_cplane.page_server_psql(format!("callmemaybe {}", self.connstr()).as_str());
|
||||
}
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"], true);
|
||||
}
|
||||
|
||||
pub fn restart(&self) {
|
||||
self.pg_ctl(&["restart"], true);
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"], true);
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!("host={} port={} user={}", self.ip, self.port, self.whoami())
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.ip,
|
||||
self.port,
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
client.query(sql, &[]).unwrap()
|
||||
}
|
||||
|
||||
pub fn open_psql(&self, db: &str) -> Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.ip,
|
||||
self.port,
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls).unwrap()
|
||||
}
|
||||
|
||||
pub fn get_pgdata(&self) -> Option<&str> {
|
||||
self.pgdata.to_str()
|
||||
}
|
||||
|
||||
// Request from pageserver stub controlfile, respective xlog
|
||||
// and a bunch of files needed to start computenode
|
||||
//
|
||||
// NOTE this "file" request is a crutch.
|
||||
// It asks pageserver to write requested page to the provided filepath
|
||||
// and thus only works locally.
|
||||
// TODO receive pages via some libpq protocol.
|
||||
// The problem I've met is that nonrelfiles are not valid utf8 and cannot be
|
||||
// handled by simple_query(), which expects text.
|
||||
// And regular query() uses prepared queries.
|
||||
|
||||
// TODO pass sysid as parameter
|
||||
pub fn setup_compute_node(&self, sysid: u64, storage_cplane: &StorageControlPlane) {
|
||||
let mut query;
|
||||
//Request pg_control from pageserver
|
||||
query = format!(
|
||||
"file {}/global/pg_control,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
1664, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
42, //forknum pg_control
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
//Request pg_xact and pg_multixact from pageserver
|
||||
//We need them for initial pageserver startup and authentication
|
||||
//TODO figure out which block number we really need
|
||||
query = format!(
|
||||
"file {}/pg_xact/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
44, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
query = format!(
|
||||
"file {}/pg_multixact/offsets/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
45, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
query = format!(
|
||||
"file {}/pg_multixact/members/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
46, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
//Request a few shared catalogs needed for authentication
|
||||
//Without them we cannot setup connection with pageserver to request further pages
|
||||
let reloids = [1260, 1261, 1262, 2396];
|
||||
for reloid in reloids.iter() {
|
||||
//FIXME request all blocks from file, not just 10
|
||||
for blkno in 0..10 {
|
||||
query = format!(
|
||||
"file {}/global/{},{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
reloid, //use it as filename
|
||||
sysid as u64, //sysid
|
||||
1664, //tablespace
|
||||
0, //dboid
|
||||
reloid, //reloid
|
||||
0, //forknum
|
||||
blkno, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
}
|
||||
}
|
||||
|
||||
fs::create_dir(format!("{}/base/13006", self.pgdata.to_str().unwrap())).unwrap();
|
||||
fs::create_dir(format!("{}/base/13007", self.pgdata.to_str().unwrap())).unwrap();
|
||||
|
||||
//FIXME figure out what wal file we need to successfully start
|
||||
let walfilepath = format!(
|
||||
"{}/pg_wal/000000010000000000000001",
|
||||
self.pgdata.to_str().unwrap()
|
||||
);
|
||||
fs::copy(
|
||||
"/home/anastasia/zenith/zenith/tmp_check/pgdata/pg_wal/000000010000000000000001",
|
||||
walfilepath,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
println!("before resetwal ");
|
||||
|
||||
let pg_resetwal_path = self.pg_bin_dir.join("pg_resetwal");
|
||||
|
||||
// Now it does nothing, just prints existing content of pg_control.
|
||||
// TODO update values with most recent lsn, xid, oid requested from pageserver
|
||||
let pg_resetwal = Command::new(pg_resetwal_path)
|
||||
.args(&["-D", self.pgdata.to_str().unwrap()])
|
||||
.arg("-n") //dry run
|
||||
//.arg("-f")
|
||||
//.args(&["--next-transaction-id", "100500"])
|
||||
//.args(&["--next-oid", "17000"])
|
||||
//.args(&["--next-transaction-id", "100500"])
|
||||
.status()
|
||||
.expect("failed to execute pg_resetwal");
|
||||
|
||||
if !pg_resetwal.success() {
|
||||
panic!("pg_resetwal failed");
|
||||
}
|
||||
|
||||
println!("setup done");
|
||||
}
|
||||
|
||||
pub fn start_proxy(&self, wal_acceptors: String) -> WalProposerNode {
|
||||
let proxy_path = PG_BIN_DIR.join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["-s", &wal_acceptors])
|
||||
.args(&["-h", &self.ip.to_string()])
|
||||
.args(&["-p", &self.port.to_string()])
|
||||
.arg("-v")
|
||||
.stderr(File::create(TEST_WORKDIR.join("safepkeeper_proxy.log")).unwrap())
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_to_s3(&self) {
|
||||
println!("Push to s3 node at '{}'", self.pgdata.to_str().unwrap());
|
||||
|
||||
let zenith_push_path = self.pg_bin_dir.join("zenith_push");
|
||||
println!("zenith_push_path: {}", zenith_push_path.to_str().unwrap());
|
||||
|
||||
let status = Command::new(zenith_push_path)
|
||||
.args(&["-D", self.pgdata.to_str().unwrap()])
|
||||
.env_clear()
|
||||
.env("S3_ENDPOINT", "https://127.0.0.1:9000")
|
||||
.env("S3_REGION", "us-east-1")
|
||||
.env("S3_ACCESSKEY", "minioadmin")
|
||||
.env("S3_SECRET", "minioadmin")
|
||||
// .env("S3_BUCKET", "zenith-testbucket")
|
||||
.status()
|
||||
.expect("failed to push node to s3");
|
||||
|
||||
if !status.success() {
|
||||
panic!("zenith_push failed");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO
|
||||
pub fn pg_bench() {}
|
||||
pub fn pg_regress() {}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.pgdata.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn regress_check(pg: &PostgresNode) {
|
||||
pg.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
|
||||
let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress");
|
||||
fs::create_dir_all(regress_run_path.clone()).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let _regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", PG_BIN_DIR.to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.env("PGPORT", pg.port.to_string())
|
||||
.env("PGUSER", pg.whoami())
|
||||
.env("PGHOST", pg.ip.to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
// test node resettlement to an empty datadir
|
||||
#[test]
|
||||
fn test_resettlement() {}
|
||||
|
||||
// test seq scan of everything after restart
|
||||
#[test]
|
||||
fn test_cold_seqscan() {}
|
||||
@@ -1,5 +0,0 @@
|
||||
#[test]
|
||||
fn test_actions() {}
|
||||
|
||||
#[test]
|
||||
fn test_regress() {}
|
||||
@@ -1,210 +0,0 @@
|
||||
#[allow(dead_code)]
|
||||
mod control_plane;
|
||||
use std::thread::sleep;
|
||||
use std::time::Duration;
|
||||
|
||||
use control_plane::ComputeControlPlane;
|
||||
use control_plane::StorageControlPlane;
|
||||
|
||||
// XXX: force all redo at the end
|
||||
// -- restart + seqscan won't read deleted stuff
|
||||
// -- pageserver api endpoint to check all rels
|
||||
|
||||
//Handcrafted cases with wal records that are (were) problematic for redo.
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_redo_cases() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5050);
|
||||
|
||||
//check 'create table as'
|
||||
node.safe_psql("postgres", "CREATE TABLE t2 AS SELECT * FROM t");
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5050);
|
||||
}
|
||||
|
||||
// Runs pg_regress on a compute node
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_regress() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
control_plane::regress_check(&node);
|
||||
}
|
||||
|
||||
// Run two postgres instances on one pageserver
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_pageserver_multitenancy() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// Allocate postgres instance, but don't start
|
||||
let node1 = compute_cplane.new_node();
|
||||
let node2 = compute_cplane.new_node();
|
||||
node1.start(&storage_cplane);
|
||||
node2.start(&storage_cplane);
|
||||
|
||||
// XXX: add some extension func to postgres to check walsender conn
|
||||
// XXX: or better just drop that
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
// check node1
|
||||
node1.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node1.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
|
||||
);
|
||||
let count: i64 = node1
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5050);
|
||||
|
||||
// check node2
|
||||
node2.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node2.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(100,200), 'payload'",
|
||||
);
|
||||
let count: i64 = node2
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 15150);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
// Start pageserver using s3 base image
|
||||
//
|
||||
// Requires working minio with hardcoded setup:
|
||||
// .env("S3_ENDPOINT", "https://127.0.0.1:9000")
|
||||
// .env("S3_REGION", "us-east-1")
|
||||
// .env("S3_ACCESSKEY", "minioadmin")
|
||||
// .env("S3_SECRET", "minioadmin")
|
||||
// .env("S3_BUCKET", "zenith-testbucket")
|
||||
// TODO use env variables in test
|
||||
fn test_pageserver_recovery() {
|
||||
// This test expects that the image is already uploaded to s3.
// To upload it, use zenith_push before the test (see node.push_to_s3() for details).
let storage_cplane = StorageControlPlane::one_page_server(true);
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);

// Wait while the daemon uploads pages from s3
|
||||
sleep(Duration::from_secs(15));
|
||||
|
||||
let node_restored = compute_cplane.new_node_wo_data();
|
||||
|
||||
// TODO: 6947041219207877724 is a hardcoded sysid for my cluster. Get it from somewhere.
|
||||
node_restored.setup_compute_node(6947041219207877724, &storage_cplane);
|
||||
|
||||
node_restored.start(&storage_cplane);
|
||||
|
||||
let rows = node_restored.safe_psql("postgres", "SELECT relname from pg_class;");
|
||||
|
||||
assert_eq!(rows.len(), 395);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
// Scenario for a future test. Not implemented yet.
|
||||
fn test_pageserver_node_switch() {
|
||||
// Create a pageserver
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// Create a regular node
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5050);
|
||||
|
||||
// Push all node files to s3
// TODO: upload them directly to the pageserver
node.push_to_s3();
// Upload data from s3 to the pageserver
//storage_cplane.upload_from_s3() // Not implemented yet
|
||||
|
||||
// Shut down the node
node.stop();

// Create a new node without files
|
||||
let node_restored = compute_cplane.new_node_wo_data();
|
||||
|
||||
// Set up the minimal set of files needed to start the node and set up the pageserver connection
|
||||
// TODO 6947041219207877724 is a hardcoded sysid. Get it from node
|
||||
node_restored.setup_compute_node(6947041219207877724, &storage_cplane);
|
||||
|
||||
//Start compute node without files
|
||||
node_restored.start(&storage_cplane);
|
||||
|
||||
// Ensure that it has the table created on the initial node
|
||||
let rows = node_restored.safe_psql("postgres", "SELECT key from t;");
|
||||
assert_eq!(rows.len(), 5050);
|
||||
}
|
||||
@@ -1,28 +1,41 @@
|
||||
// Restart acceptors one by one while compute is under the load.
|
||||
#[allow(dead_code)]
|
||||
mod control_plane;
|
||||
use control_plane::ComputeControlPlane;
|
||||
use control_plane::StorageControlPlane;
|
||||
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use std::{thread, time};
|
||||
|
||||
use control_plane::compute::{ComputeControlPlane, PostgresNode};
|
||||
|
||||
use integration_tests;
|
||||
use integration_tests::PostgresNodeExt;
|
||||
use integration_tests::TestStorageControlPlane;
|
||||
|
||||
const DOWNTIME: u64 = 2;
|
||||
|
||||
fn start_node_with_wal_proposer(
|
||||
timeline: &str,
|
||||
compute_cplane: &mut ComputeControlPlane,
|
||||
wal_acceptors: &String,
|
||||
) -> Arc<PostgresNode> {
|
||||
let node = compute_cplane.new_test_master_node(timeline);
|
||||
let _node = node.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!("wal_acceptors='{}'\n", wal_acceptors),
|
||||
);
|
||||
node.start().unwrap();
|
||||
node
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_acceptors_normal_work() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
fn test_embedded_wal_proposer() {
|
||||
let local_env = integration_tests::create_test_env("test_embedded_wal_proposer");
|
||||
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -43,24 +56,114 @@ fn test_acceptors_normal_work() {
|
||||
// check wal files equality
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_acceptors_normal_work() {
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_normal_work");
|
||||
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
// check wal files equality
|
||||
}
|
||||
|
||||
// Run page server and multiple safekeepers, and multiple compute nodes running
|
||||
// against different timelines.
|
||||
#[test]
|
||||
fn test_many_timelines() {
|
||||
// Initialize a new repository, and set up WAL safekeepers and page server.
|
||||
const REDUNDANCY: usize = 3;
|
||||
const N_TIMELINES: usize = 5;
|
||||
let local_env = integration_tests::create_test_env("test_many_timelines");
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// Create branches
|
||||
let mut timelines: Vec<String> = Vec::new();
|
||||
timelines.push("main".to_string());
|
||||
|
||||
for i in 1..N_TIMELINES {
|
||||
let branchname = format!("experimental{}", i);
|
||||
storage_cplane
|
||||
.pageserver
|
||||
.branch_create(&branchname, "main")
|
||||
.unwrap();
|
||||
timelines.push(branchname);
|
||||
}
|
||||
|
||||
// start postgres on each timeline
|
||||
let mut nodes = Vec::new();
|
||||
for tli_name in timelines {
|
||||
let node = start_node_with_wal_proposer(&tli_name, &mut compute_cplane, &wal_acceptors);
|
||||
nodes.push(node.clone());
|
||||
}
|
||||
|
||||
// create schema
|
||||
for node in &nodes {
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
}
|
||||
|
||||
// Populate data
|
||||
for node in &nodes {
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
}
|
||||
|
||||
// Check data
|
||||
for node in &nodes {
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
}
|
||||
}
|
||||
|
||||
// Majority is always alive
|
||||
#[test]
|
||||
fn test_acceptors_restarts() {
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_restarts");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
const FAULT_PROBABILITY: f32 = 0.01;
|
||||
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
let mut failed_node: Option<usize> = None;
|
||||
|
||||
// check basic work with table
|
||||
@@ -80,7 +183,7 @@ fn test_acceptors_restarts() {
|
||||
} else {
|
||||
let node: usize = rng.gen_range(0..REDUNDANCY);
|
||||
failed_node = Some(node);
|
||||
storage_cplane.wal_acceptors[node].stop();
|
||||
storage_cplane.wal_acceptors[node].stop().unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -93,10 +196,10 @@ fn test_acceptors_restarts() {
|
||||
assert_eq!(count, 500500);
|
||||
}
|
||||
|
||||
fn start_acceptor(cplane: &Arc<StorageControlPlane>, no: usize) {
|
||||
fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
thread::sleep(time::Duration::from_secs(1));
|
||||
thread::sleep(time::Duration::from_secs(DOWNTIME));
|
||||
cp.wal_acceptors[no].start();
|
||||
});
|
||||
}
|
||||
@@ -105,20 +208,18 @@ fn start_acceptor(cplane: &Arc<StorageControlPlane>, no: usize) {
|
||||
// them again and check that nothing was lost. Repeat.
|
||||
// N_CRASHES env var
|
||||
#[test]
|
||||
fn test_acceptors_unavalability() {
|
||||
fn test_acceptors_unavailability() {
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_unavailability");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 2;
|
||||
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -129,21 +230,26 @@ fn test_acceptors_unavalability() {
|
||||
psql.execute("INSERT INTO t values (1, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
storage_cplane.wal_acceptors[0].stop();
|
||||
// Shut down all wal acceptors
|
||||
storage_cplane.wal_acceptors[0].stop().unwrap();
|
||||
let cp = Arc::new(storage_cplane);
|
||||
start_acceptor(&cp, 0);
|
||||
let now = SystemTime::now();
|
||||
psql.execute("INSERT INTO t values (2, 'payload')", &[])
|
||||
.unwrap();
|
||||
assert!(now.elapsed().unwrap().as_secs() > 1);
|
||||
// Here we check that the query above was hanging
|
||||
// while wal_acceptor was unavailable
|
||||
assert!(now.elapsed().unwrap().as_secs() >= DOWNTIME);
|
||||
psql.execute("INSERT INTO t values (3, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
cp.wal_acceptors[1].stop();
|
||||
cp.wal_acceptors[1].stop().unwrap();
|
||||
start_acceptor(&cp, 1);
|
||||
psql.execute("INSERT INTO t values (4, 'payload')", &[])
|
||||
.unwrap();
|
||||
assert!(now.elapsed().unwrap().as_secs() > 2);
|
||||
// Here we check that the query above was hanging
|
||||
// while wal_acceptor was unavailable
|
||||
assert!(now.elapsed().unwrap().as_secs() >= 2 * DOWNTIME);
|
||||
|
||||
psql.execute("INSERT INTO t values (5, 'payload')", &[])
|
||||
.unwrap();
|
||||
@@ -154,19 +260,21 @@ fn test_acceptors_unavalability() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
// Ensure that all inserts succeeded.
|
||||
// Including ones that were waiting for wal acceptor restart.
|
||||
assert_eq!(count, 15);
|
||||
}
|
||||
|
||||
fn simulate_failures(cplane: &Arc<StorageControlPlane>) {
|
||||
fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
|
||||
let mut rng = rand::thread_rng();
|
||||
let n_acceptors = cplane.wal_acceptors.len();
|
||||
let failure_period = time::Duration::from_secs(1);
|
||||
loop {
|
||||
while cplane.is_running() {
|
||||
thread::sleep(failure_period);
|
||||
let mask: u32 = rng.gen_range(0..(1 << n_acceptors));
|
||||
for i in 0..n_acceptors {
|
||||
if (mask & (1 << i)) != 0 {
|
||||
cplane.wal_acceptors[i].stop();
|
||||
cplane.wal_acceptors[i].stop().unwrap();
|
||||
}
|
||||
}
|
||||
thread::sleep(failure_period);
|
||||
@@ -181,29 +289,29 @@ fn simulate_failures(cplane: &Arc<StorageControlPlane>) {
|
||||
// Race condition test
|
||||
#[test]
|
||||
fn test_race_conditions() {
|
||||
let local_env = integration_tests::create_test_env("test_race_conditions");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(
|
||||
&local_env, REDUNDANCY,
|
||||
));
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
let cplane = Arc::new(storage_cplane);
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
simulate_failures(&cp);
|
||||
|
||||
let cp = storage_cplane.clone();
|
||||
let failures_thread = thread::spawn(move || {
|
||||
simulate_failures(cp);
|
||||
});
|
||||
|
||||
let mut psql = node.open_psql("postgres");
|
||||
@@ -218,5 +326,7 @@ fn test_race_conditions() {
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 500500);
|
||||
cplane.stop();
|
||||
|
||||
storage_cplane.stop();
|
||||
failures_thread.join().unwrap();
|
||||
}
|
||||
|
||||
23
mgmt-console/.gitignore
vendored
@@ -1,23 +0,0 @@
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
/.pnp
|
||||
.pnp.js
|
||||
|
||||
# testing
|
||||
/coverage
|
||||
|
||||
# production
|
||||
/build
|
||||
|
||||
# misc
|
||||
.DS_Store
|
||||
.env.local
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
@@ -1,55 +0,0 @@
|
||||
Mock implementation of a management console.
|
||||
|
||||
See demo-howto.txt for usage.
|
||||
|
||||
Building and Installation
|
||||
-------------------------
|
||||
|
||||
To compile Postgres:
|
||||
sudo apt build-dep postgresql
|
||||
sudo apt install bison flex libz-dev libssl-dev
|
||||
sudo apt install ccache
|
||||
sudo apt install libcurl4-openssl-dev libxml2-dev
|
||||
|
||||
For the webapp:
|
||||
# NOTE: This requires at least version 1.1.0 of python3-flask. That's not
|
||||
# available in Debian Buster, need at least Bullseye.
|
||||
|
||||
sudo apt install python3 python3-flask python3-pip npm webpack
|
||||
pip3 install Flask-BasicAuth
|
||||
pip3 install boto3
|
||||
|
||||
Git clone, compile, and install the patched version of Postgres:
|
||||
|
||||
git clone https://github.com/libzenith/postgres.git
|
||||
cd postgres
|
||||
git checkout zenith-experiments
|
||||
./configure --enable-debug --enable-cassert --with-openssl --prefix=/home/heikki/pgsql-install --with-libxml CC="ccache gcc" CFLAGS="-O0"
|
||||
make -j4 -s install
|
||||
|
||||
Get the webapp:
|
||||
cd ~
|
||||
git clone https://github.com/libzenith/zenith-mgmt-console.git
|
||||
cd zenith-mgmt-console
|
||||
mkdir pgdatadirs
|
||||
|
||||
|
||||
openssl req -new -x509 -days 365 -nodes -text -out server.crt \
|
||||
-keyout server.key -subj "/CN=zenith-demo"
|
||||
|
||||
For Mock S3 server (unless you want to test against a real cloud service):
|
||||
sudo apt install python3-tornado
|
||||
|
||||
cd ~/zenith-mgmt-console
|
||||
git clone https://github.com/hlinnaka/ms3.git
|
||||
|
||||
Compile & run it:
|
||||
npm install
|
||||
webpack # compile React app
|
||||
|
||||
BASIC_AUTH_PASSWORD=<password> ./launch-local.sh
|
||||
|
||||
|
||||
You can view the contents of the S3 bucket with browser:
|
||||
|
||||
http://<server>/list_bucket
|
||||
@@ -1,340 +0,0 @@
|
||||
from flask import request
|
||||
from flask_basicauth import BasicAuth
|
||||
from flask import render_template
|
||||
from subprocess import PIPE, STDOUT, run, Popen
|
||||
import html
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import logging
|
||||
import time
|
||||
|
||||
import boto3
|
||||
from boto3.session import Session
|
||||
from botocore.client import Config
|
||||
from botocore.handlers import set_list_objects_encoding_type_url
|
||||
|
||||
from flask import Flask
|
||||
|
||||
import waldump
|
||||
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
app.config['BASIC_AUTH_USERNAME'] = 'zenith'
|
||||
app.config['BASIC_AUTH_PASSWORD'] = os.getenv('BASIC_AUTH_PASSWORD')
|
||||
app.config['BASIC_AUTH_FORCE'] = True
|
||||
|
||||
basic_auth = BasicAuth(app)
|
||||
|
||||
# S3 configuration:
|
||||
|
||||
ENDPOINT = os.getenv('S3_ENDPOINT', 'https://localhost:9000')
|
||||
ACCESS_KEY = os.getenv('S3_ACCESSKEY', 'minioadmin')
|
||||
SECRET = os.getenv('S3_SECRET', '')
|
||||
BUCKET = os.getenv('S3_BUCKET', 'foobucket')
|
||||
|
||||
print("Using bucket at " + ENDPOINT);
|
||||
|
||||
#boto3.set_stream_logger('botocore', logging.DEBUG)
|
||||
|
||||
session = Session(aws_access_key_id=ACCESS_KEY,
|
||||
aws_secret_access_key=SECRET,
|
||||
region_name=os.getenv('S3_REGION', 'auto'))
|
||||
|
||||
# needed for google cloud?
|
||||
session.events.unregister('before-parameter-build.s3.ListObjects',
|
||||
set_list_objects_encoding_type_url)
|
||||
|
||||
s3resource = session.resource('s3',
|
||||
endpoint_url=ENDPOINT,
|
||||
verify=False,
|
||||
config=Config(signature_version='s3v4'))
|
||||
s3bucket = s3resource.Bucket(BUCKET)
|
||||
|
||||
s3_client = boto3.client('s3',
|
||||
endpoint_url=ENDPOINT,
|
||||
verify=False,
|
||||
config=Config(signature_version='s3v4'),
|
||||
aws_access_key_id=ACCESS_KEY,
|
||||
aws_secret_access_key=SECRET)
|
||||
|
||||
|
||||
@app.route("/")
|
||||
def index():
|
||||
return render_template("index.html")
|
||||
|
||||
|
||||
@app.route("/api/waldump")
|
||||
def render_waldump():
|
||||
return render_template("waldump.html")
|
||||
|
||||
@app.route('/api/fetch_wal')
|
||||
def fetch_wal():
|
||||
return waldump.fetch_wal(request, s3bucket);
|
||||
|
||||
@app.route("/api/server_status")
|
||||
def server_status():
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
dirs.sort()
|
||||
|
||||
primary = None
|
||||
standbys = []
|
||||
|
||||
for dirname in dirs:
|
||||
|
||||
result = run("pg_ctl status -D pgdatadirs/" + dirname, stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
srv = {
|
||||
'datadir': dirname,
|
||||
'status': result.stdout,
|
||||
'port': None
|
||||
}
|
||||
|
||||
if dirname == 'primary':
|
||||
primary = srv;
|
||||
primary['port'] = 5432;
|
||||
else:
|
||||
standby_match = re.search('standby_([0-9]+)', dirname)
|
||||
if standby_match:
|
||||
srv['port'] = int(standby_match.group(1))
|
||||
|
||||
standbys.append(srv);
|
||||
|
||||
return {'primary': primary, 'standbys': standbys}
|
||||
|
||||
@app.route('/api/list_bucket')
|
||||
def list_bucket():
|
||||
|
||||
response = 'cloud bucket contents:<br>\n'
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
response = response + html.escape(file.key) + '<br>\n'
|
||||
|
||||
return response
|
||||
|
||||
def walpos_str(walpos):
|
||||
return '{:X}/{:X}'.format(walpos >> 32, walpos & 0xFFFFFFFF)
|
||||
|
||||
@app.route('/api/bucket_summary')
|
||||
def bucket_summary():
|
||||
|
||||
nonrelimages = []
|
||||
minwal = int(0)
|
||||
maxwal = int(0)
|
||||
minseqwal = int(0)
|
||||
maxseqwal = int(0)
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
path = file.key
|
||||
match = re.search('nonreldata/nonrel_([0-9A-F]+).tar', path)
|
||||
if match:
|
||||
walpos = int(match.group(1), 16)
|
||||
nonrelimages.append(walpos_str(walpos))
|
||||
|
||||
match = re.search('nonreldata/nonrel_([0-9A-F]+)-([0-9A-F]+)', path)
|
||||
if match:
|
||||
endwal = int(match.group(2), 16)
|
||||
if endwal > maxwal:
|
||||
maxwal = endwal
|
||||
|
||||
match = re.search('walarchive/([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', path)
|
||||
if match:
|
||||
tli = int(match.group(1), 16)
|
||||
logno = int(match.group(2), 16)
|
||||
segno = int(match.group(3), 16)
|
||||
# FIXME: this assumes default 16 MB wal segment size
|
||||
logsegno = logno * (0x100000000 // (16*1024*1024)) + segno
|
||||
|
||||
seqwal = int((logsegno + 1) * (16*1024*1024))
|
||||
|
||||
if seqwal > maxseqwal:
|
||||
maxseqwal = seqwal;
|
||||
if minseqwal == 0 or seqwal < minseqwal:
|
||||
minseqwal = seqwal;
|
||||
|
||||
return {
|
||||
'nonrelimages': nonrelimages,
|
||||
'minwal': walpos_str(minwal),
|
||||
'maxwal': walpos_str(maxwal),
|
||||
'minseqwal': walpos_str(minseqwal),
|
||||
'maxseqwal': walpos_str(maxseqwal)
|
||||
}
|
||||
|
||||
def print_cmd_result(cmd_result):
|
||||
return print_cmd_result_ex(cmd_result.args, cmd_result.returncode, cmd_result.stdout)
|
||||
|
||||
def print_cmd_result_ex(cmd, returncode, stdout):
|
||||
res = ''
|
||||
res += 'ran command:\n' + str(cmd) + '\n'
|
||||
res += 'It returned code ' + str(returncode) + '\n'
|
||||
res += '\n'
|
||||
res += 'stdout/stderr:\n'
|
||||
res += stdout
|
||||
|
||||
return res
|
||||
|
||||
@app.route('/api/init_primary', methods=['GET', 'POST'])
|
||||
def init_primary():
|
||||
|
||||
initdb_result = run("initdb -D pgdatadirs/primary --username=zenith --pwfile=pg-password.txt", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
if initdb_result.returncode != 0:
|
||||
return print_cmd_result(initdb_result)
|
||||
|
||||
# Append archive_mode and archive_command and port to postgresql.conf
|
||||
f=open("pgdatadirs/primary/postgresql.conf", "a+")
|
||||
f.write("listen_addresses='*'\n")
|
||||
f.write("archive_mode=on\n")
|
||||
f.write("archive_command='zenith_push --archive-wal-path=%p --archive-wal-fname=%f'\n")
|
||||
f.write("ssl=on\n")
|
||||
f.close()
|
||||
|
||||
f=open("pgdatadirs/primary/pg_hba.conf", "a+")
|
||||
f.write("# allow SSL connections with password from anywhere\n")
|
||||
f.write("hostssl all all 0.0.0.0/0 md5\n")
|
||||
f.write("hostssl all all ::0/0 md5\n")
|
||||
f.close()
|
||||
|
||||
shutil.copyfile("server.crt", "pgdatadirs/primary/server.crt")
|
||||
shutil.copyfile("server.key", "pgdatadirs/primary/server.key")
|
||||
os.chmod("pgdatadirs/primary/server.key", 0o0600)
|
||||
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
|
||||
responsestr = print_cmd_result(initdb_result) + '\n'
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/zenith_push', methods=['GET', 'POST'])
|
||||
def zenith_push():
|
||||
# Stop the primary if it's running
|
||||
stop_result = run(args=["pg_ctl", "stop", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
|
||||
# Call zenith_push
|
||||
push_result = run("zenith_push -D pgdatadirs/primary", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
# Restart the primary
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
|
||||
responsestr = print_cmd_result(stop_result) + '\n'
|
||||
responsestr += print_cmd_result(push_result) + '\n'
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout) + '\n'
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/create_standby', methods=['GET', 'POST'])
|
||||
def create_standby():
|
||||
|
||||
walpos = request.form.get('walpos')
|
||||
if not walpos:
|
||||
return 'no walpos'
|
||||
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
|
||||
last_port = 5432
|
||||
|
||||
for dirname in dirs:
|
||||
|
||||
standby_match = re.search('standby_([0-9]+)', dirname)
|
||||
if standby_match:
|
||||
port = int(standby_match.group(1))
|
||||
if port > last_port:
|
||||
last_port = port
|
||||
|
||||
standby_port = last_port + 1
|
||||
|
||||
standby_dir = "pgdatadirs/standby_" + str(standby_port)
|
||||
|
||||
# Call zenith_restore
|
||||
restore_result = run(["zenith_restore", "--end=" + walpos, "-D", standby_dir], stdout=PIPE, stderr=STDOUT, encoding='latin1')
|
||||
responsestr = print_cmd_result(restore_result)
|
||||
|
||||
if restore_result.returncode == 0:
|
||||
# Append hot_standby and port to postgresql.conf
|
||||
f=open(standby_dir + "/postgresql.conf", "a+")
|
||||
f.write("hot_standby=on\n")
|
||||
f.write("port=" + str(standby_port) + "\n")
|
||||
f.close()
|
||||
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", standby_dir, "-l", standby_dir + "/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
responsestr += '\n\n' + print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/destroy_server', methods=['GET', 'POST'])
|
||||
def destroy_primary():
|
||||
|
||||
datadir = request.form.get('datadir')
|
||||
|
||||
# Check that the datadir parameter doesn't contain anything funny.
|
||||
if not re.match("^[A-Za-z0-9_-]+$", datadir):
|
||||
raise Exception('invalid datadir: ' + datadir)
|
||||
|
||||
# Stop the server if it's running
|
||||
stop_result = run(args=["pg_ctl", "stop", "-m", "immediate", "-D", "pgdatadirs/" + datadir], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
|
||||
shutil.rmtree('pgdatadirs/' + datadir, ignore_errors=True)
|
||||
|
||||
responsestr = print_cmd_result(stop_result) + '\n'
|
||||
responsestr += 'Deleted datadir ' + datadir + '.\n'
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/restore_primary', methods=['GET', 'POST'])
|
||||
def restore_primary():
|
||||
|
||||
# Call zenith_restore
|
||||
restore_result = run(["zenith_restore", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, encoding='latin1')
|
||||
responsestr = print_cmd_result(restore_result)
|
||||
|
||||
# Append restore_command to postgresql.conf, so that it can find the last raw WAL segments
|
||||
f=open("pgdatadirs/primary/postgresql.conf", "a+")
|
||||
f.write("listen_addresses='*'\n")
|
||||
f.write("restore_command='zenith_restore --archive-wal-path=%p --archive-wal-fname=%f'\n")
|
||||
f.write("ssl=on\n")
|
||||
f.close()
|
||||
|
||||
if restore_result.returncode == 0:
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/slicedice', methods=['GET', 'POST'])
|
||||
def run_slicedice():
|
||||
result = run("zenith_slicedice", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
responsestr = print_cmd_result(result)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/reset_demo', methods=['POST'])
|
||||
def reset_all():
|
||||
result = run("pkill -9 postgres", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
for dirname in dirs:
|
||||
shutil.rmtree('pgdatadirs/' + dirname)
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
s3_client.delete_object(Bucket = BUCKET, Key = file.key)
|
||||
|
||||
responsestr = print_cmd_result(result) + '\n'
|
||||
responsestr += '''
|
||||
Deleted all Postgres datadirs.
|
||||
Deleted all files in object storage bucket.
|
||||
'''
|
||||
|
||||
return responsestr
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run()
|
||||
@@ -1,3 +0,0 @@
|
||||
module.exports = {
|
||||
presets: ["@babel/preset-env", "@babel/preset-react"],
|
||||
};
|
||||
@@ -1,67 +0,0 @@
|
||||
Mock implementation of a management console.
|
||||
|
||||
This isn't very different from a "normal" PostgreSQL installation with
|
||||
a base backup and WAL archive. The main user-visible difference is
|
||||
that when you create a standby server, we don't restore the whole data
|
||||
directory, but only the "non-relation" files. Relation files are
|
||||
restored on demand, when they're accessed the first time. That makes
|
||||
the "create standby" operation is very fast, but with some delay when
|
||||
you connect and start running queries instead. Most visible if you
|
||||
have a large database. (However, see note below about large databases)
|
||||
|
||||
Note: lots of things are broken/unsafe. Things will fail if a table is
|
||||
larger than 1 GB. Or if there are more than 1000 files in the cloud
|
||||
bucket.
|
||||
|
||||
How to use this demo:
|
||||
|
||||
1. If there are any leftovers from previous runs, reset by clicking
|
||||
the RESET DEMO button. This kills and deletes all Postgres servers,
|
||||
and empties the cloud storage bucket
|
||||
|
||||
2. Create primary server by clicking on the "Init primary" button
|
||||
|
||||
3. Push a base image of the primary to cloud storage, by clicking the
|
||||
"push base image" button. (This takes about 30 seconds, be
|
||||
patient)
|
||||
|
||||
4. Connect to primary with psql, and create a test table with a little data.
|
||||
|
||||
psql postgres -p5432 -U zenith -h<host>
|
||||
|
||||
create table mytable (i int4);
|
||||
|
||||
insert into mytable values (1);
|
||||
select pg_switch_wal();
|
||||
|
||||
The Postgres password is the same as for the management console.
|
||||
|
||||
5. Now that there's a new WAL segment in the archive, we can "slice &
|
||||
dice" it. Click on the "Slice & dice button".
|
||||
|
||||
6. Perform more updates on the primary to generate more WAL.
|
||||
|
||||
insert into mytable values (2); select pg_switch_wal();
|
||||
insert into mytable values (3); select pg_switch_wal();
|
||||
insert into mytable values (4); select pg_switch_wal();
|
||||
insert into mytable values (5); select pg_switch_wal();
|
||||
|
||||
7. Slice & Dice the WAL again
|
||||
|
||||
8. Now you can create read-only standby servers at any point in the
|
||||
WAL. Type a WAL position in the text box (or use the slider), and
|
||||
click "Create new standby". The first standby is created at port 5433,
|
||||
the second at port 5434, and so forth.
|
||||
|
||||
9. Connect to the standby with "psql -p 5433". Note that it takes a
|
||||
few seconds until the connection is established. That's because the
|
||||
standby has to restore the basic system catalogs, like pg_database and
|
||||
pg_authid from the backup. After connecting, you can do "\d" to list
|
||||
tables; this will also take a few seconds, as more catalog tables are
|
||||
restored from backup. Subsequent commands will be faster.
|
||||
|
||||
Run queries in the standby:
|
||||
|
||||
select * from mytable;
|
||||
|
||||
the result depends on the LSN that you picked when you created the server.
|
||||
@@ -1,463 +0,0 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import ReactDOM from 'react-dom';
|
||||
import Loader from "react-loader-spinner";
|
||||
import { Router, Route, Link, IndexRoute, hashHistory, browserHistory } from 'react-router';
|
||||
|
||||
function ServerStatus(props) {
|
||||
const datadir = props.server.datadir;
|
||||
const status = props.server.status;
|
||||
const port = props.server.port;
|
||||
|
||||
return (
|
||||
<div>
|
||||
<h2>{ datadir == 'primary' ? 'Primary' : datadir }</h2>
|
||||
status: <div className='status'>{status}</div><br/>
|
||||
to connect: <span className='shellcommand'>psql -h { window.location.hostname } -p { port } -U zenith postgres</span><br/>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function StandbyList(props) {
|
||||
const bucketSummary = props.bucketSummary;
|
||||
const standbys = props.standbys;
|
||||
const maxwalpos = bucketSummary.maxwal ? walpos_to_int(bucketSummary.maxwal) : 0;
|
||||
|
||||
const [walposInput, setWalposInput] = useState({ src: 'text', value: '0/0'});
|
||||
|
||||
// find earliest base image
|
||||
const minwalpos = bucketSummary.nonrelimages ? bucketSummary.nonrelimages.reduce((minpos, imgpos_str, index, array) => {
|
||||
const imgpos = walpos_to_int(imgpos_str);
|
||||
return (minpos == 0 || imgpos < minpos) ? imgpos : minpos;
|
||||
}, 0) : 0;
|
||||
|
||||
const can_create_standby = minwalpos > 0 && maxwalpos > 0 && maxwalpos >= minwalpos;
|
||||
var walpos_valid = true;
|
||||
|
||||
function create_standby() {
|
||||
const formdata = new FormData();
|
||||
formdata.append("walpos", walposStr);
|
||||
|
||||
props.startOperation('Creating new standby at ' + walposStr + '...',
|
||||
fetch("/api/create_standby", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
function destroy_standby(datadir) {
|
||||
const formdata = new FormData();
|
||||
formdata.append("datadir", datadir);
|
||||
props.startOperation('Destroying ' + datadir + '...',
|
||||
fetch("/api/destroy_server", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
const handleSliderChange = (event) => {
|
||||
setWalposInput({ src: 'slider', value: event.target.value });
|
||||
}
|
||||
|
||||
const handleWalposChange = (event) => {
|
||||
setWalposInput({ src: 'text', value: event.target.value });
|
||||
}
|
||||
|
||||
var sliderValue;
|
||||
var walposStr;
|
||||
if (walposInput.src == 'text')
|
||||
{
|
||||
const walpos = walpos_to_int(walposInput.value);
|
||||
|
||||
if (walpos >= minwalpos && walpos <= maxwalpos)
|
||||
walpos_valid = true;
|
||||
else
|
||||
walpos_valid = false;
|
||||
|
||||
sliderValue = Math.round((walpos - minwalpos) / (maxwalpos - minwalpos) * 100);
|
||||
walposStr = walposInput.value;
|
||||
}
|
||||
else
|
||||
{
|
||||
const slider = walposInput.value;
|
||||
const new_walpos = minwalpos + slider / 100 * (maxwalpos - minwalpos);
|
||||
|
||||
console.log('minwalpos: '+ minwalpos);
|
||||
console.log('maxwalpos: '+ maxwalpos);
|
||||
|
||||
walposStr = int_to_walpos(Math.round(new_walpos));
|
||||
walpos_valid = true;
|
||||
console.log(walposStr);
|
||||
}
|
||||
|
||||
var standbystatus = ''
|
||||
if (standbys)
|
||||
{
|
||||
standbystatus =
|
||||
<div>
|
||||
{
|
||||
standbys.length > 0 ?
|
||||
standbys.map((server) =>
|
||||
<>
|
||||
<ServerStatus key={ 'status_' + server.datadir} server={server}/>
|
||||
<button key={ 'destroy_' + server.datadir} onClick={e => destroy_standby(server.datadir)}>Destroy standby</button>
|
||||
</>
|
||||
) : "no standby servers"
|
||||
}
|
||||
</div>
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<h2>Standbys</h2>
|
||||
<button onClick={create_standby} disabled={!can_create_standby || !walpos_valid}>Create new Standby</button> at LSN
|
||||
<input type="text" id="walpos_input" value={ walposStr } onChange={handleWalposChange} disabled={!can_create_standby}/>
|
||||
<input type="range" id="walpos_slider" min="0" max="100" steps="1" value={sliderValue} onChange={handleSliderChange} disabled={!can_create_standby}/>
|
||||
<br/>
|
||||
{ standbystatus }
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ServerList(props) {
|
||||
const primary = props.serverStatus ? props.serverStatus.primary : null;
|
||||
const standbys = props.serverStatus ? props.serverStatus.standbys : [];
|
||||
const bucketSummary = props.bucketSummary;
|
||||
|
||||
var primarystatus = '';
|
||||
|
||||
function destroy_primary() {
|
||||
const formdata = new FormData();
|
||||
formdata.append("datadir", 'primary');
|
||||
props.startOperation('Destroying primary...',
|
||||
fetch("/api/destroy_server", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
function restore_primary() {
|
||||
props.startOperation('Restoring primary...',
|
||||
fetch("/api/restore_primary", { method: 'POST' }));
|
||||
}
|
||||
|
||||
if (primary)
|
||||
{
|
||||
primarystatus =
|
||||
<div>
|
||||
<ServerStatus server={primary}/>
|
||||
<button onClick={destroy_primary}>Destroy primary</button>
|
||||
</div>
|
||||
}
|
||||
else
|
||||
{
|
||||
primarystatus =
|
||||
<div>
|
||||
no primary server<br/>
|
||||
<button onClick={restore_primary}>Restore primary</button>
|
||||
</div>
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
{ primarystatus }
|
||||
<StandbyList standbys={standbys} startOperation={props.startOperation} bucketSummary={props.bucketSummary}/>
|
||||
<p className="todo">
|
||||
Should we list the WAL safekeeper nodes here? Or are they part of the Storage? Or not visible to users at all?
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
function BucketSummary(props) {
|
||||
const bucketSummary = props.bucketSummary;
|
||||
const startOperation = props.startOperation;
|
||||
|
||||
function slicedice() {
|
||||
startOperation('Slicing sequential WAL to per-relation WAL...',
|
||||
fetch("/api/slicedice", { method: 'POST' }));
|
||||
}
|
||||
|
||||
if (!bucketSummary.nonrelimages)
|
||||
{
|
||||
return <>loading...</>
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<div>Base images at following WAL positions:
|
||||
<ul>
|
||||
{bucketSummary.nonrelimages.map((img) => (
|
||||
<li key={img}>{img}</li>
|
||||
))}
|
||||
</ul>
|
||||
</div>
|
||||
Sliced WAL is available up to { bucketSummary.maxwal }<br/>
|
||||
Raw WAL is available up to { bucketSummary.maxseqwal }<br/>
|
||||
|
||||
<br/>
|
||||
<button onClick={slicedice}>Slice & Dice WAL</button>
|
||||
<p className="todo">
|
||||
Currently, the slicing or "sharding" of the WAL needs to be triggered manually, by clicking the above button.
|
||||
<br/>
|
||||
TODO: make it a continuous process that runs in the WAL safekeepers, or in the Page Servers, or as a standalone service.
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ProgressIndicator()
|
||||
{
|
||||
return (
|
||||
<div>
|
||||
<Loader
|
||||
type="Puff"
|
||||
color="#00BFFF"
|
||||
height={100}
|
||||
width={100}
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
function walpos_to_int(walpos)
|
||||
{
|
||||
const [hi, lo] = walpos.split('/');
|
||||
|
||||
// The part before '/' holds the high 32 bits of the LSN, so shift it before adding.
return parseInt(hi, 16) * 0x100000000 + parseInt(lo, 16);
|
||||
}
|
||||
|
||||
function int_to_walpos(x)
|
||||
{
|
||||
console.log('converting ' + x);
|
||||
return (Math.floor((x / 0x100000000)).toString(16) + '/' + (x % 0x100000000).toString(16)).toUpperCase();
|
||||
}
|
||||
|
||||
function OperationStatus(props) {
|
||||
const lastOperation = props.lastOperation;
|
||||
const inProgress = props.inProgress;
|
||||
const operationResult = props.operationResult;
|
||||
|
||||
if (lastOperation)
|
||||
{
|
||||
return (
|
||||
<div><h2>Last operation:</h2>
|
||||
<div>{lastOperation} { (!inProgress && lastOperation) ? 'done!' : '' }</div>
|
||||
<div className='result'>
|
||||
{inProgress ? <ProgressIndicator/> : <pre>{operationResult}</pre>}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
else
|
||||
return '';
|
||||
}
|
||||
|
||||
function ActionButtons(props) {
|
||||
|
||||
const startOperation = props.startOperation;
|
||||
const bucketSummary = props.bucketSummary;
|
||||
|
||||
function reset_demo() {
|
||||
startOperation('resetting everything...',
|
||||
fetch("/api/reset_demo", { method: 'POST' }));
|
||||
}
|
||||
|
||||
function init_primary() {
|
||||
startOperation('Initializing new primary...',
|
||||
fetch("/api/init_primary", { method: 'POST' }));
|
||||
}
|
||||
|
||||
function zenith_push() {
|
||||
startOperation('Pushing new base image...',
|
||||
fetch("/api/zenith_push", { method: 'POST' }));
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<p className="todo">
|
||||
RESET DEMO deletes everything in the storage bucket, and stops and destroys all servers. This resets the whole demo environment to the initial state.
|
||||
</p>
|
||||
<button onClick={reset_demo}>RESET DEMO</button>
|
||||
<p className="todo">
|
||||
Init Primary runs initdb to create a new primary server. Click this after Resetting the demo.
|
||||
</p>
|
||||
|
||||
<button onClick={init_primary}>Init primary</button>
|
||||
|
||||
<p className="todo">
|
||||
Push Base Image stops the primary, copies the current state of the primary to the storage bucket as a new base backup, and restarts the primary.
|
||||
<br/>
|
||||
TODO: This should be handled by a continuous background process, probably running in the storage nodes. And without having to shut down the cluster, of course.
|
||||
</p>
|
||||
|
||||
<button onClick={zenith_push}>Push base image</button>
|
||||
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function Sidenav(props)
|
||||
{
|
||||
const toPage = (page) => (event) => {
|
||||
//event.preventDefault()
|
||||
props.switchPage(page);
|
||||
};
|
||||
return (
|
||||
<div>
|
||||
<h3 className="sidenav-item">Menu</h3>
|
||||
<a href="#servers" onClick={toPage('servers')} className="sidenav-item">Servers</a>
|
||||
<a href="#storage" onClick={toPage('storage')} className="sidenav-item">Storage</a>
|
||||
<a href="#snapshots" onClick={toPage('snapshots')} className="sidenav-item">Snapshots</a>
|
||||
<a href="#demo" onClick={toPage('demo')} className="sidenav-item">Demo</a>
|
||||
<a href="#import" onClick={toPage('import')} className="sidenav-item">Import / Export</a>
|
||||
<a href="#jobs" onClick={toPage('jobs')} className="sidenav-item">Jobs</a>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function App()
|
||||
{
|
||||
const [page, setPage] = useState('servers');
|
||||
const [serverStatus, setServerStatus] = useState({});
|
||||
const [bucketSummary, setBucketSummary] = useState({});
|
||||
const [lastOperation, setLastOperation] = useState('');
|
||||
const [inProgress, setInProgress] = useState('');
|
||||
const [operationResult, setOperationResult] = useState('');
|
||||
|
||||
useEffect(() => {
|
||||
reloadStatus();
|
||||
}, []);
|
||||
|
||||
function startOperation(operation, promise)
|
||||
{
|
||||
promise.then(result => result.text()).then(resultText => {
|
||||
operationFinished(resultText);
|
||||
});
|
||||
|
||||
setLastOperation(operation);
|
||||
setInProgress(true);
|
||||
setOperationResult('');
|
||||
}
|
||||
|
||||
function operationFinished(result)
|
||||
{
|
||||
setInProgress(false);
|
||||
setOperationResult(result);
|
||||
reloadStatus();
|
||||
}
|
||||
|
||||
function clearOperation()
|
||||
{
|
||||
setLastOperation('')
|
||||
setInProgress('');
|
||||
setOperationResult('');
|
||||
console.log("cleared");
|
||||
}
|
||||
|
||||
function reloadStatus()
|
||||
{
|
||||
fetch('/api/server_status').then(res => res.json()).then(data => {
|
||||
setServerStatus(data);
|
||||
});
|
||||
|
||||
fetch('/api/bucket_summary').then(res => res.json()).then(data => {
|
||||
setBucketSummary(data);
|
||||
});
|
||||
}
|
||||
|
||||
const content = () => {
|
||||
console.log(page);
|
||||
if (page === 'servers') {
|
||||
return (
|
||||
<>
|
||||
<h1>Server status</h1>
|
||||
<ServerList startOperation={ startOperation }
|
||||
serverStatus={ serverStatus }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'storage') {
|
||||
return (
|
||||
<>
|
||||
<h1>Storage bucket status</h1>
|
||||
<BucketSummary startOperation={ startOperation }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'snapshots') {
|
||||
return (
|
||||
<>
|
||||
<h1>Snapshots</h1>
|
||||
<p className="todo">
|
||||
In Zenith, snapshots are just specific points (LSNs) in the WAL history, with a label. A snapshot prevents garbage collecting old data that's still needed to reconstruct the database at that LSN.
|
||||
</p>
|
||||
<p className="todo">
|
||||
TODO:
|
||||
<ul>
|
||||
<li>List existing snapshots</li>
|
||||
<li>Create new snapshot manually, from current state or from a given LSN</li>
|
||||
<li>Drill into the WAL stream to see what has happened. Provide tools for e.g. finding the point where a table was dropped</li>
|
||||
<li>Create snapshots automatically based on events in the WAL, like when you call pg_create_restore_point() in the primary</li>
|
||||
<li>Launch new reader instance at a snapshot</li>
|
||||
<li>Export snapshot</li>
|
||||
<li>Rollback cluster to a snapshot</li>
|
||||
</ul>
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'demo') {
|
||||
return (
|
||||
<>
|
||||
<h1>Misc actions</h1>
|
||||
<ActionButtons startOperation={ startOperation }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'import') {
|
||||
return (
|
||||
<>
|
||||
<h1>Import & Export tools</h1>
|
||||
<p className="TODO">TODO:
|
||||
<ul>
|
||||
<li>Initialize database from existing backup (pg_basebackup, WAL-G, pgbackrest)</li>
|
||||
<li>Initialize from a pg_dump or other SQL script</li>
|
||||
<li>Launch batch job to import data files from S3</li>
|
||||
<li>Launch batch job to export database with pg_dump to S3</li>
|
||||
</ul>
|
||||
These jobs can be run against reader processing nodes. We can even
spawn a new reader node dedicated to a job, and destroy it when the job is done.
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'jobs') {
|
||||
return (
|
||||
<>
|
||||
<h1>Batch jobs</h1>
|
||||
<p className="TODO">TODO:
|
||||
<ul>
|
||||
<li>List running jobs launched from Import & Export tools</li>
|
||||
<li>List other batch jobs launched by the user</li>
|
||||
<li>Launch new batch jobs</li>
|
||||
</ul>
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
function switchPage(page)
|
||||
{
|
||||
console.log("topage " + page);
|
||||
setPage(page)
|
||||
clearOperation();
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="row">
|
||||
<div className="sidenav">
|
||||
<Sidenav switchPage={switchPage} className="column"/>
|
||||
</div>
|
||||
<div className="column">
|
||||
<div>
|
||||
{ content() }
|
||||
</div>
|
||||
<OperationStatus lastOperation={ lastOperation }
|
||||
inProgress = { inProgress }
|
||||
operationResult = { operationResult }/>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
ReactDOM.render(<App/>, document.getElementById('reactApp'));
|
||||
@@ -1,105 +0,0 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import ReactDOM from 'react-dom';
|
||||
import Loader from "react-loader-spinner";
|
||||
|
||||
function walpos_to_int(walpos)
|
||||
{
|
||||
const [hi, lo] = walpos.split('/');
|
||||
|
||||
// The part before '/' holds the high 32 bits of the LSN, so shift it before adding.
return parseInt(hi, 16) * 0x100000000 + parseInt(lo, 16);
|
||||
}
|
||||
|
||||
const palette = [
|
||||
"#003f5c",
|
||||
"#2f4b7c",
|
||||
"#665191",
|
||||
"#a05195",
|
||||
"#d45087",
|
||||
"#f95d6a",
|
||||
"#ff7c43",
|
||||
"#ffa600"];
|
||||
|
||||
function WalRecord(props)
|
||||
{
|
||||
const firstwalpos = props.firstwalpos;
|
||||
const endwalpos = props.endwalpos;
|
||||
const record = props.record;
|
||||
const index = props.index;
|
||||
const xidmap = props.xidmap;
|
||||
|
||||
const startpos = walpos_to_int(record.start)
|
||||
const endpos = walpos_to_int(record.end)
|
||||
|
||||
const scale = 1000 / (16*1024*1024)
|
||||
const startx = (startpos - firstwalpos) * scale;
|
||||
const endx = (endpos - firstwalpos) * scale;
|
||||
|
||||
const xidindex = xidmap[record.xid];
|
||||
const color = palette[index % palette.length];
|
||||
|
||||
const y = 5 + (xidindex) * 20 + (index % 2) * 2;
|
||||
|
||||
return (
|
||||
<line x1={ startx } y1={y} x2={endx} y2={y} stroke={ color } strokeWidth="5">
|
||||
<title>
|
||||
start: { record.start } end: { record.end }
|
||||
</title>
|
||||
</line>
|
||||
)
|
||||
}
|
||||
|
||||
function WalFile(props)
|
||||
{
|
||||
const walContent = props.walContent;
|
||||
const firstwalpos = props.firstwalpos;
|
||||
const xidmap = props.xidmap;
|
||||
|
||||
return <svg width="1000" height="200">
|
||||
{
|
||||
walContent.records ?
|
||||
walContent.records.map((record, index) =>
|
||||
<WalRecord key={record.start} firstwalpos={firstwalpos} record={record} index={index} xidmap={xidmap}/>
|
||||
) : "no records"
|
||||
}
|
||||
</svg>
|
||||
}
|
||||
|
||||
function WalDumpApp()
|
||||
{
|
||||
const [walContent, setWalContent] = useState({});
|
||||
|
||||
const filename = '00000001000000000000000C';
|
||||
|
||||
useEffect(() => {
|
||||
fetch('/fetch_wal?filename='+filename).then(res => res.json()).then(data => {
|
||||
setWalContent(data);
|
||||
});
|
||||
}, []);
|
||||
|
||||
var firstwalpos = 0;
|
||||
var endwalpos = 0;
|
||||
var numxids = 0;
|
||||
var xidmap = {};
|
||||
if (walContent.records && walContent.records.length > 0)
|
||||
{
|
||||
firstwalpos = walpos_to_int(walContent.records[0].start);
|
||||
endwalpos = firstwalpos + 16*1024*1024;
|
||||
|
||||
walContent.records.forEach(rec => {
|
||||
if (!xidmap[rec.xid])
|
||||
{
|
||||
xidmap[rec.xid] = ++numxids;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
<h2>{filename}</h2>
|
||||
<WalFile walContent={walContent} firstwalpos={firstwalpos} endwalpos={endwalpos} xidmap={xidmap}/>
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
console.log('hey there');
|
||||
ReactDOM.render(<WalDumpApp/>, document.getElementById('waldump'));
|
||||
@@ -1,9 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# NOTE: You must set the following environment variables before running this:
|
||||
# BASIC_AUTH_PASSWORD - basic http auth password
|
||||
# S3_ACCESSKEY
|
||||
# S3_SECRET
|
||||
|
||||
|
||||
S3_ENDPOINT=https://storage.googleapis.com S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql-install/bin:$PATH flask run --host=0.0.0.0
|
||||
@@ -1,8 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# NOTE: You should set the BASIC_AUTH_PASSWORD environment variable before calling
|
||||
|
||||
# Launch S3 server
|
||||
(cd ms3 && python3 -m ms3.app --listen-address=localhost) &
|
||||
|
||||
FLASK_ENV=development S3_REGION=auto S3_ENDPOINT=http://localhost:9009 S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql.fsmfork/bin:$PATH flask run --host=0.0.0.0
|
||||
6144
mgmt-console/package-lock.json
generated
File diff suppressed because it is too large
@@ -1,27 +0,0 @@
|
||||
{
|
||||
"name": "starter-kit",
|
||||
"version": "1.1.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1",
|
||||
"build": "webpack",
|
||||
"start": "python app.py"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"react": "^17.0.1",
|
||||
"react-dom": "^17.0.1",
|
||||
"react-loader-spinner": "^4.0.0",
|
||||
"react-router": "^5.2.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/core": "^7.13.1",
|
||||
"@babel/preset-env": "^7.13.5",
|
||||
"@babel/preset-react": "^7.12.13",
|
||||
"babel-loader": "^8.2.2",
|
||||
"webpack": "^5.24.2",
|
||||
"webpack-cli": "^4.5.0"
|
||||
}
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
<head>
|
||||
|
||||
<style>
|
||||
.status {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.shellcommand {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.result {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
|
||||
.todo {font-style: italic;}
|
||||
|
||||
|
||||
h1 {color: blue;}
|
||||
|
||||
.column {
|
||||
float: left;
|
||||
width: 50%;
|
||||
padding: 10px;
|
||||
}
|
||||
/* Clear floats after the columns */
|
||||
.row:after {
|
||||
content: "";
|
||||
display: table;
|
||||
clear: both;
|
||||
}
|
||||
|
||||
.sidenav {
|
||||
float: left;
|
||||
width: 150px;
|
||||
padding: 10px;
|
||||
background-color: pink;
|
||||
}
|
||||
|
||||
.sidenav-item {
|
||||
padding:10px 0px;
|
||||
border:none;
|
||||
display:block;
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="reactApp"></div>
|
||||
|
||||
<!-- Attach React components -->
|
||||
<script type="text/javascript" src="{{ url_for('static', filename='app_bundle.js') }}"></script>
|
||||
</body>
|
||||
@@ -1,46 +0,0 @@
|
||||
<head>
|
||||
|
||||
<style>
|
||||
.status {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.shellcommand {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.result {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
padding: 10px;
|
||||
}
|
||||
h1 {color: blue;}
|
||||
p {color: red;}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.row {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
/* Create two columns that sit next to each other */
|
||||
.column1 {
|
||||
flex: 30%;
|
||||
padding: 10px;
|
||||
}
|
||||
.column2 {
|
||||
flex: 70%;
|
||||
padding: 10px;
|
||||
}
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="waldump"></div>
|
||||
|
||||
<!-- Attach React components -->
|
||||
<script type="text/javascript" src="{{ url_for('static', filename='waldump_bundle.js') }}"></script>
|
||||
</body>
|
||||
@@ -1,25 +0,0 @@
|
||||
#
|
||||
# This file contains work-in-progress code to visualize WAL contents.
|
||||
#
|
||||
# This is the API endpoint that calls a 'zenith_wal_to_json' executable,
|
||||
# which is a hacked version of pg_waldump that prints information about the
|
||||
# records in JSON format. The code in js/waldump.js displays it.
|
||||
#
|
||||
|
||||
import os
|
||||
import re
|
||||
from subprocess import PIPE, STDOUT, run, Popen
|
||||
|
||||
def fetch_wal(request, s3bucket):
|
||||
filename = request.args.get('filename')
|
||||
if not re.match("^[A-Za-z0-9_]+$", filename):
|
||||
raise Exception('invalid WAL filename: ' + filename)
|
||||
|
||||
# FIXME: this downloads the WAL file to current dir. Use a temp dir? Pipe?
|
||||
s3bucket.download_file('walarchive/' + filename, filename)
|
||||
|
||||
result = run("zenith_wal_to_json " + filename, stdout=PIPE, universal_newlines=True, shell=True)
|
||||
|
||||
os.unlink(filename);
|
||||
|
||||
return result.stdout
|
||||
@@ -1,27 +0,0 @@
var webpack = require('webpack');
module.exports = {
    entry: {
        app: './js/app.js',
        waldump: './js/waldump.js'
    },
    output: {
        filename: "[name]_bundle.js",
        path: __dirname + '/static'
    },
    module: {
        rules: [
            {
                test: /\.js?$/,
                exclude: /node_modules/,
                use: {
                    loader: 'babel-loader',
                    options: {
                        presets: ['@babel/preset-env']
                    }
                }
            }
        ]
    },
    plugins: [
    ]
};
@@ -1,179 +0,0 @@
|
||||
#zenith.py
|
||||
import click
|
||||
import testgres
|
||||
import os
|
||||
|
||||
from testgres import PostgresNode
|
||||
from tabulate import tabulate
|
||||
|
||||
zenith_base_dir = '/home/anastasia/zenith/basedir'
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Run the Zenith CLI."""
|
||||
|
||||
@click.group()
|
||||
def pg():
|
||||
"""Db operations
|
||||
|
||||
NOTE: 'database' here means one postgresql node
|
||||
"""
|
||||
|
||||
@click.command(name='create')
|
||||
@click.option('--name', required=True)
|
||||
@click.option('-s', '--storage-name', help='Name of the storage',
|
||||
default='zenith-local',
|
||||
show_default=True)
|
||||
@click.option('--snapshot', help='init from the snapshot. Snap is a name or URL')
|
||||
@click.option('--no-start', is_flag=True, help='Do not start created node',
|
||||
default=False, show_default=True)
|
||||
def pg_create(name, storage_name, snapshot, no_start):
|
||||
"""Initialize the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
# TODO skip init, instead of that link node with storage or upload it from snapshot
|
||||
node.init()
|
||||
if not no_start:
|
||||
node.start()
|
||||
|
||||
@click.command(name='start')
|
||||
@click.option('--name', required=True)
|
||||
@click.option('--snapshot')
|
||||
@click.option('--read-only', is_flag=True, help='Start read-only node', show_default=True)
|
||||
def pg_start(name, snapshot, read_only):
|
||||
"""Start the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
# TODO pass snapshot as a parameter
|
||||
node.start()
|
||||
|
||||
@click.command(name='stop')
|
||||
@click.option('--name', required=True)
|
||||
def pg_stop(name):
|
||||
"""Stop the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
node.stop()
|
||||
|
||||
@click.command(name='destroy')
|
||||
@click.option('--name', required=True)
|
||||
def pg_destroy(name):
|
||||
"""Drop the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
node.cleanup()
|
||||
|
||||
@click.command(name='list')
|
||||
def pg_list():
|
||||
"""List existing databases"""
|
||||
dirs = os.listdir(os.path.join(zenith_base_dir, 'pg'))
|
||||
path={}
|
||||
status={}
|
||||
data=[]
|
||||
|
||||
for dirname in dirs:
|
||||
path[dirname] = os.path.join(zenith_base_dir, 'pg', dirname)
|
||||
fname = os.path.join( path[dirname], 'data/postmaster.pid')
|
||||
try:
|
||||
f = open(fname,'r')
|
||||
status[dirname] = f.readlines()[-1]
|
||||
except OSError as err:
|
||||
status[dirname]='inactive'
|
||||
data.append([dirname , status[dirname], path[dirname]])
|
||||
|
||||
print(tabulate(data, headers=['Name', 'Status', 'Path']))
|
||||
|
||||
pg.add_command(pg_create)
|
||||
pg.add_command(pg_destroy)
|
||||
pg.add_command(pg_start)
|
||||
pg.add_command(pg_stop)
|
||||
pg.add_command(pg_list)
|
||||
|
||||
|
||||
|
||||
@click.group()
|
||||
def storage():
|
||||
"""Storage operations"""
|
||||
|
||||
@click.command(name='attach')
|
||||
@click.option('--name')
|
||||
def storage_attach(name):
|
||||
"""Attach the storage"""
|
||||
|
||||
@click.command(name='detach')
|
||||
@click.option('--name')
|
||||
@click.option('--force', is_flag=True, show_default=True)
|
||||
def storage_detach(name):
|
||||
"""Detach the storage"""
|
||||
|
||||
@click.command(name='list')
|
||||
def storage_list():
|
||||
"""List existing storages"""
|
||||
|
||||
storage.add_command(storage_attach)
|
||||
storage.add_command(storage_detach)
|
||||
storage.add_command(storage_list)
|
||||
|
||||
@click.group()
|
||||
def snapshot():
|
||||
"""Snapshot operations"""
|
||||
|
||||
@click.command(name='create')
|
||||
def snapshot_create():
|
||||
"""Create new snapshot"""
|
||||
|
||||
@click.command(name='destroy')
|
||||
def snapshot_destroy():
|
||||
"""Destroy the snapshot"""
|
||||
|
||||
@click.command(name='pull')
|
||||
def snapshot_pull():
|
||||
"""Pull remote snapshot"""
|
||||
|
||||
@click.command(name='push')
|
||||
def snapshot_push():
|
||||
"""Push snapshot to remote"""
|
||||
|
||||
@click.command(name='import')
|
||||
def snapshot_import():
|
||||
"""Convert given format to zenith snapshot"""
|
||||
|
||||
@click.command(name='export')
|
||||
def snapshot_export():
|
||||
"""Convert zenith snapshot to PostgreSQL compatible format"""
|
||||
|
||||
snapshot.add_command(snapshot_create)
|
||||
snapshot.add_command(snapshot_destroy)
|
||||
snapshot.add_command(snapshot_pull)
|
||||
snapshot.add_command(snapshot_push)
|
||||
snapshot.add_command(snapshot_import)
|
||||
snapshot.add_command(snapshot_export)
|
||||
|
||||
@click.group()
|
||||
def wal():
|
||||
"""WAL operations"""
|
||||
|
||||
@click.command()
|
||||
def wallist(name="list"):
|
||||
"""List WAL files"""
|
||||
|
||||
wal.add_command(wallist)
|
||||
|
||||
|
||||
@click.command()
|
||||
def console():
|
||||
"""Open web console"""
|
||||
|
||||
main.add_command(pg)
|
||||
main.add_command(storage)
|
||||
main.add_command(snapshot)
|
||||
main.add_command(wal)
|
||||
main.add_command(console)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
pageserver/Cargo.lock (generated, 2373 lines)
File diff suppressed because it is too large
@@ -8,12 +8,10 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
crossbeam-channel = "0.5.0"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3.13"
|
||||
lazy_static = "1.4.0"
|
||||
slog-stdlog = "4.1.0"
|
||||
@@ -26,11 +24,24 @@ clap = "2.33.0"
|
||||
termion = "1.5.6"
|
||||
tui = "0.14.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", features = ["no-verify-ssl"] }
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres-protocol = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
rocksdb = "0.16.0"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
walkdir = "2"
|
||||
thiserror = "1.0"
|
||||
hex = "0.4.3"
|
||||
tar = "0.4.33"
|
||||
parse_duration = "2.1.1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
fs_extra = "1.2.0"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
//
// Triggers a postgres build if there is no postgres binary present at
// 'REPO_ROOT/tmp_install/bin/postgres'.
//
// I can see a lot of disadvantages to this kind of automation; the main
// advantage is being able to build everything and run the integration tests
// in a bare repo just by running 'cargo test'.
//
// We could detect whether it is a debug or release build and run the
// corresponding pg build, but that seems like overkill for now.
//
// Problem #1 -- the language server in my editor likes calling 'cargo build'
// by itself. So if I delete the tmp_install directory, it magically reappears
// after some time. During this compilation 'cargo build' may whine about
// "waiting for file lock on build directory".
//
// Problem #2 -- cargo would run this only if something changed in the crate.
//
// And generally speaking, postgres is not a build dependency for the pageserver,
// just for the integration tests. So let's not mix the two. I'll leave this file
// in place for a while, in case anybody is tempted to do the same.
//

// use std::path::Path;
|
||||
// use std::process::{Command};
|
||||
|
||||
fn main() {
|
||||
// // build some postgres if it is not done none yet
|
||||
// if !Path::new("../tmp_install/bin/postgres").exists() {
|
||||
// let make_res = Command::new("make")
|
||||
// .arg("postgres")
|
||||
// .env_clear()
|
||||
// .status()
|
||||
// .expect("failed to execute 'make postgres'");
|
||||
|
||||
// if !make_res.success() {
|
||||
// panic!("postgres build failed");
|
||||
// }
|
||||
// }
|
||||
}
|
||||
pageserver/src/basebackup.rs (new file, 399 lines)
@@ -0,0 +1,399 @@
|
||||
use crate::ZTimelineId;
|
||||
use log::*;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use tar::{Builder, Header};
|
||||
use walkdir::WalkDir;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
|
||||
use crate::repository::{BufferTag, RelTag, Timeline};
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
fn new_tar_header(path: &str, size: u64) -> anyhow::Result<Header> {
|
||||
let mut header = Header::new_gnu();
|
||||
header.set_size(size);
|
||||
header.set_path(path)?;
|
||||
header.set_mode(0b110000000);
|
||||
header.set_mtime(
|
||||
SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
);
|
||||
header.set_cksum();
|
||||
Ok(header)
|
||||
}
|
||||
|
||||
//
|
||||
// Generate SLRU segment files from repository
|
||||
//
|
||||
fn add_slru_segments(
|
||||
ar: &mut Builder<&mut dyn Write>,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
path: &str,
|
||||
forknum: u8,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let rel = RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum,
|
||||
};
|
||||
let (first, last) = timeline.get_range(rel, lsn)?;
|
||||
const SEG_SIZE: usize =
|
||||
pg_constants::BLCKSZ as usize * pg_constants::SLRU_PAGES_PER_SEGMENT as usize;
|
||||
let mut seg_buf = [0u8; SEG_SIZE];
|
||||
let mut curr_segno: Option<u32> = None;
|
||||
for page in first..last {
|
||||
let tag = BufferTag { rel, blknum: page };
|
||||
let img = timeline.get_page_at_lsn(tag, lsn)?;
|
||||
// Zero length image indicates truncated segment: just skip it
|
||||
if img.len() != 0 {
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
let segno = page / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if curr_segno.is_some() && curr_segno.unwrap() != segno {
|
||||
let segname = format!("{}/{:>04X}", path, curr_segno.unwrap());
|
||||
let header = new_tar_header(&segname, SEG_SIZE as u64)?;
|
||||
ar.append(&header, &seg_buf[..])?;
|
||||
seg_buf = [0u8; SEG_SIZE];
|
||||
}
|
||||
curr_segno = Some(segno);
|
||||
let offs_start = (page % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
|
||||
* pg_constants::BLCKSZ as usize;
|
||||
let offs_end = offs_start + pg_constants::BLCKSZ as usize;
|
||||
seg_buf[offs_start..offs_end].copy_from_slice(&img);
|
||||
}
|
||||
}
|
||||
if curr_segno.is_some() {
|
||||
let segname = format!("{}/{:>04X}", path, curr_segno.unwrap());
|
||||
let header = new_tar_header(&segname, SEG_SIZE as u64)?;
|
||||
ar.append(&header, &seg_buf[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
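To make the packing arithmetic in add_slru_segments concrete, here is a small self-contained sketch. The constants are only assumptions (the stock PostgreSQL defaults of an 8 KB block size and 32 SLRU pages per segment); the real values come from pg_constants.

// Sketch of the segment/offset math used above, with assumed default constants.
const BLCKSZ: usize = 8192;                 // assumption: default PostgreSQL block size
const SLRU_PAGES_PER_SEGMENT: u32 = 32;     // assumption: default SLRU pages per segment
const SEG_SIZE: usize = BLCKSZ * SLRU_PAGES_PER_SEGMENT as usize; // 256 KiB per segment file

fn main() {
    let page: u32 = 100;                                   // hypothetical SLRU page number
    let segno = page / SLRU_PAGES_PER_SEGMENT;             // 100 / 32 = 3 -> file "0003"
    let offs_start = (page % SLRU_PAGES_PER_SEGMENT) as usize * BLCKSZ; // 4 * 8192 = 32768
    println!("page {} -> segment {:>04X}, bytes {}..{}",
             page, segno, offs_start, offs_start + BLCKSZ);
    assert_eq!(SEG_SIZE, 262_144);
}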
|
||||
//
|
||||
// Extract pg_filenode.map files from repository
|
||||
//
|
||||
fn add_relmap_files(
|
||||
ar: &mut Builder<&mut dyn Write>,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
snappath: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
for db in timeline.get_databases(lsn)?.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: *db,
|
||||
blknum: 0,
|
||||
};
|
||||
let img = timeline.get_page_at_lsn(tag, lsn)?;
|
||||
let path = if db.spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
String::from("global/pg_filenode.map")
|
||||
} else {
|
||||
// User defined tablespaces are not supported
|
||||
assert!(db.spcnode == pg_constants::DEFAULTTABLESPACE_OID);
|
||||
let src_path = format!("{}/base/1/PG_VERSION", snappath);
|
||||
let dst_path = format!("base/{}/PG_VERSION", db.dbnode);
|
||||
ar.append_path_with_name(&src_path, &dst_path)?;
|
||||
format!("base/{}/pg_filenode.map", db.dbnode)
|
||||
};
|
||||
assert!(img.len() == 512);
|
||||
let header = new_tar_header(&path, img.len() as u64)?;
|
||||
ar.append(&header, &img[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_files(
|
||||
ar: &mut Builder<&mut dyn Write>,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
for xid in timeline.get_twophase(lsn)?.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_TWOPHASE_FORKNUM,
|
||||
},
|
||||
blknum: *xid,
|
||||
};
|
||||
let img = timeline.get_page_at_lsn(tag, lsn)?;
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
let crc = crc32c::crc32c(&img[..]);
|
||||
buf.put_u32_le(crc);
|
||||
let path = format!("pg_twophase/{:>08X}", xid);
|
||||
let header = new_tar_header(&path, buf.len() as u64)?;
|
||||
ar.append(&header, &buf[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
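The resulting pg_twophase entry is simply the state image followed by a checksum. A minimal sketch of that layout, mirroring the put_u32_le(crc) call above (the crc32c crate is already a dependency; the payload is a placeholder):

// Each pg_twophase/<XID> entry written above is: raw state bytes + little-endian CRC-32C trailer.
fn twophase_file_bytes(state: &[u8]) -> Vec<u8> {
    let mut buf = Vec::with_capacity(state.len() + 4);
    buf.extend_from_slice(state);                                  // two-phase state image
    buf.extend_from_slice(&crc32c::crc32c(state).to_le_bytes());   // CRC-32C of the image
    buf
}

fn twophase_file_name(xid: u32) -> String {
    format!("pg_twophase/{:>08X}", xid)    // e.g. xid 0x162E -> "pg_twophase/0000162E"
}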
|
||||
//
|
||||
// Add generated pg_control file
|
||||
//
|
||||
fn add_pgcontrol_file(
|
||||
ar: &mut Builder<&mut dyn Write>,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(checkpoint_bytes) =
|
||||
timeline.get_page_image(BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM), Lsn(0))?
|
||||
{
|
||||
if let Some(pg_control_bytes) = timeline.get_page_image(
|
||||
BufferTag::fork(pg_constants::PG_CONTROLFILE_FORKNUM),
|
||||
Lsn(0),
|
||||
)? {
|
||||
let mut pg_control = postgres_ffi::decode_pg_control(pg_control_bytes)?;
|
||||
let mut checkpoint = postgres_ffi::decode_checkpoint(checkpoint_bytes)?;
|
||||
|
||||
checkpoint.redo = lsn.0;
|
||||
checkpoint.nextXid.value += 1;
|
||||
// TODO: When we restart the master there are no active transactions and oldestXid is
|
||||
// equal to nextXid if there are no prepared transactions.
|
||||
// Let's ignore them for a while...
|
||||
checkpoint.oldestXid = checkpoint.nextXid.value as u32;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
let pg_control_bytes = postgres_ffi::encode_pg_control(pg_control);
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
ar.append(&header, &pg_control_bytes[..])?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Generate tarball with non-relational files from repository
|
||||
///
|
||||
pub fn send_tarball_at_lsn(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
snapshot_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut ar = Builder::new(write);
|
||||
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshot_lsn.0);
|
||||
|
||||
debug!("sending tarball of snapshot in {}", snappath);
|
||||
for entry in WalkDir::new(&snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
// Shared catalogs are exempt
|
||||
if relpath.starts_with("global/") {
|
||||
trace!("sending shared catalog {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
if entry.file_name() != "pg_filenode.map"
|
||||
&& entry.file_name() != "pg_control"
|
||||
&& !relpath.starts_with("pg_xact/")
|
||||
&& !relpath.starts_with("pg_multixact/")
|
||||
{
|
||||
trace!("sending {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
trace!("not sending {}", relpath.display());
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
}
|
||||
}
|
||||
|
||||
add_slru_segments(
|
||||
&mut ar,
|
||||
timeline,
|
||||
"pg_xact",
|
||||
pg_constants::PG_XACT_FORKNUM,
|
||||
lsn,
|
||||
)?;
|
||||
add_slru_segments(
|
||||
&mut ar,
|
||||
timeline,
|
||||
"pg_multixact/members",
|
||||
pg_constants::PG_MXACT_MEMBERS_FORKNUM,
|
||||
lsn,
|
||||
)?;
|
||||
add_slru_segments(
|
||||
&mut ar,
|
||||
timeline,
|
||||
"pg_multixact/offsets",
|
||||
pg_constants::PG_MXACT_OFFSETS_FORKNUM,
|
||||
lsn,
|
||||
)?;
|
||||
add_relmap_files(&mut ar, timeline, lsn, &snappath)?;
|
||||
add_twophase_files(&mut ar, timeline, lsn)?;
|
||||
add_pgcontrol_file(&mut ar, timeline, lsn)?;
|
||||
|
||||
ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Send a tarball containing a snapshot of all non-relation files in the
|
||||
/// PostgreSQL data directory, at given LSN
|
||||
///
|
||||
/// There must be a snapshot at the given LSN in the snapshots directory; we cannot
|
||||
/// reconstruct the state at an arbitrary LSN at the moment.
|
||||
///
|
||||
pub fn send_snapshot_tarball(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
snapshotlsn: Lsn,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut ar = Builder::new(write);
|
||||
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn.0);
|
||||
let walpath = format!("timelines/{}/wal", timelineid);
|
||||
|
||||
debug!("sending tarball of snapshot in {}", snappath);
|
||||
//ar.append_dir_all("", &snappath)?;
|
||||
|
||||
for entry in WalkDir::new(&snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
// Shared catalogs are exempt
|
||||
if relpath.starts_with("global/") {
|
||||
trace!("sending shared catalog {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
trace!("sending {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else {
|
||||
trace!("not sending {}", relpath.display());
|
||||
|
||||
// FIXME: For now, also send all the relation files.
|
||||
// This really shouldn't be necessary, and kind of
|
||||
// defeats the point of having a page server in the
|
||||
// first place. But it is useful at least when
|
||||
// debugging with the DEBUG_COMPARE_LOCAL option (see
|
||||
// vendor/postgres/src/backend/storage/smgr/pagestore_smgr.c)
|
||||
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Also send all the WAL. The compute node would only need
|
||||
// the WAL that applies to non-relation files, because the page
|
||||
// server handles all the relation files. But we don't have a
|
||||
// mechanism for separating relation and non-relation WAL at the
|
||||
// moment.
|
||||
for entry in std::fs::read_dir(&walpath)? {
|
||||
let entry = entry?;
|
||||
let fullpath = &entry.path();
|
||||
let relpath = fullpath.strip_prefix(&walpath).unwrap();
|
||||
|
||||
if !entry.path().is_file() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let archive_fname = relpath.to_str().unwrap();
|
||||
let archive_fname = archive_fname
|
||||
.strip_suffix(".partial")
|
||||
.unwrap_or(&archive_fname);
|
||||
let archive_path = "pg_wal/".to_owned() + archive_fname;
|
||||
ar.append_path_with_name(fullpath, archive_path)?;
|
||||
}
|
||||
|
||||
ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a path, relative to the root of PostgreSQL data directory, as
|
||||
/// a PostgreSQL relation data file.
|
||||
///
|
||||
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
/*
|
||||
* Relation data files can be in one of the following directories:
|
||||
*
|
||||
* global/
|
||||
* shared relations
|
||||
*
|
||||
* base/<db oid>/
|
||||
* regular relations, default tablespace
|
||||
*
|
||||
* pg_tblspc/<tblspc oid>/<tblspc version>/
|
||||
* within a non-default tablespace (the name of the directory
|
||||
* depends on version)
|
||||
*
|
||||
* And the relation data files themselves have a filename like:
|
||||
*
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split('/');
|
||||
let dbnode_str = s.next().ok_or(FilePathError::InvalidFileName)?;
|
||||
let _dbnode = dbnode_str.parse::<u32>()?;
|
||||
let fname = s.next().ok_or(FilePathError::InvalidFileName)?;
|
||||
if s.next().is_some() {
|
||||
return Err(FilePathError::InvalidFileName);
|
||||
};
|
||||
|
||||
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
error!("tablespaces not implemented yet");
|
||||
Err(FilePathError::InvalidFileName)
|
||||
} else {
|
||||
Err(FilePathError::InvalidFileName)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_rel_file_path(path: &str) -> bool {
|
||||
parse_rel_file_path(path).is_ok()
|
||||
}
|
||||
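A few illustrative inputs for is_rel_file_path, based only on the directory layout described in the comment above. The OIDs are invented, and the expected results assume parse_relfilename follows PostgreSQL's usual <relnode>[_<fork>][.<segno>] naming.

#[cfg(test)]
mod rel_path_examples {
    use super::is_rel_file_path;

    #[test]
    fn examples() {
        assert!(is_rel_file_path("global/1262"));              // shared relation, main fork
        assert!(is_rel_file_path("base/13021/16384.1"));       // regular relation, segment 1
        assert!(!is_rel_file_path("base/13021/PG_VERSION"));   // not a relation file name
        assert!(!is_rel_file_path("pg_twophase/0000162E"));    // handled by add_twophase_files
        assert!(!is_rel_file_path("pg_tblspc/16400/x/16401")); // tablespaces are TODO above
    }
}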
@@ -1,43 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings};
|
||||
|
||||
pub mod pg;
|
||||
pub mod snapshot;
|
||||
pub mod storage;
|
||||
mod subcommand;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli_commands = subcommand::ClapCommands {
|
||||
commands: vec![
|
||||
Box::new(pg::PgCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("pg"),
|
||||
}),
|
||||
Box::new(storage::StorageCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("storage"),
|
||||
}),
|
||||
Box::new(snapshot::SnapshotCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("snapshot"),
|
||||
}),
|
||||
],
|
||||
};
|
||||
|
||||
let matches = App::new("zenith")
|
||||
.about("Zenith CLI")
|
||||
.version("1.0")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommands(cli_commands.generate())
|
||||
.get_matches();
|
||||
|
||||
if let Some(subcommand) = matches.subcommand_name() {
|
||||
println!("'git {}' was used", subcommand);
|
||||
}
|
||||
|
||||
match matches.subcommand() {
|
||||
("pg", Some(sub_args)) => cli_commands.commands[0].run(sub_args.clone())?,
|
||||
("storage", Some(sub_args)) => cli_commands.commands[1].run(sub_args.clone())?,
|
||||
("snapshot", Some(sub_args)) => cli_commands.commands[2].run(sub_args.clone())?,
|
||||
("", None) => println!("No subcommand"),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,105 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct PgCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for PgCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith compute nodes")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list").about("List existing compute nodes"))
|
||||
.subcommand(
|
||||
App::new("create")
|
||||
.about(
|
||||
"Create (init) new data directory using given storage and start postgres",
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("storage")
|
||||
.short("s")
|
||||
.long("storage")
|
||||
.takes_value(true)
|
||||
.help("Name of the storage node to use"),
|
||||
)
|
||||
//TODO should it be just name of uploaded snapshot or some path?
|
||||
.arg(
|
||||
Arg::with_name("snapshot")
|
||||
.long("snapshot")
|
||||
.takes_value(true)
|
||||
.help("Name of the snapshot to use"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("nostart")
|
||||
.long("no-start")
|
||||
.takes_value(false)
|
||||
.help("Don't start postgres on the created node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("destroy")
|
||||
.about("Stop postgres and destroy node's data directory")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("start")
|
||||
.about("Start postgres on the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("replica")
|
||||
.long("replica")
|
||||
.takes_value(false)
|
||||
.help("Start the compute node as replica"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("stop")
|
||||
.about("Stop postgres on the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("show")
|
||||
.about("Show info about the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run PgCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct SnapshotCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for SnapshotCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith snapshots")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("create").arg(Arg::with_name("pgdata").required(true)))
|
||||
.subcommand(App::new("destroy"))
|
||||
.subcommand(App::new("start"))
|
||||
.subcommand(App::new("stop"))
|
||||
.subcommand(App::new("show"))
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run SnapshotCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct StorageCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for StorageCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith storage nodes")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("attach"))
|
||||
.subcommand(App::new("detach"))
|
||||
.subcommand(App::new("show"))
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run StorageCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
use anyhow::Result;
|
||||
|
||||
/// All subcommands need to implement this interface.
|
||||
pub trait SubCommand {
|
||||
/// Generates the cli-config that Clap requires for the subcommand.
|
||||
fn gen_clap_command(&self) -> clap::App;
|
||||
|
||||
/// Runs the body of the subcommand.
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()>;
|
||||
}
|
||||
|
||||
/// A struct which holds a vector of heap-allocated `Box`es of trait objects all of which must
|
||||
/// implement the `SubCommand` trait, but other than that, can be of any type.
|
||||
pub struct ClapCommands {
|
||||
pub commands: Vec<Box<dyn SubCommand>>,
|
||||
}
|
||||
|
||||
impl ClapCommands {
|
||||
/// Generates a vector of `clap::Apps` that can be passed into clap's `.subcommands()` method in
|
||||
/// order to generate the full CLI.
|
||||
pub fn generate(&self) -> Vec<clap::App> {
|
||||
let mut v: Vec<clap::App> = Vec::new();
|
||||
|
||||
for command in self.commands.iter() {
|
||||
v.push(command.gen_clap_command());
|
||||
}
|
||||
v
|
||||
}
|
||||
}
|
||||
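For context on how these pieces fit together, a hypothetical extra subcommand (WalCmd is invented for illustration) would implement the same trait and be pushed into ClapCommands.commands next to pg, storage and snapshot:

// Hypothetical additional subcommand, following the PgCmd/StorageCmd/SnapshotCmd pattern.
use anyhow::Result;
use clap::App;

use crate::subcommand;

pub struct WalCmd<'a> {
    pub clap_cmd: clap::App<'a, 'a>,
}

impl subcommand::SubCommand for WalCmd<'_> {
    fn gen_clap_command(&self) -> clap::App {
        let c = self.clap_cmd.clone();
        c.about("Operations with zenith WAL")
            .subcommand(App::new("list"))
    }

    fn run(&self, args: clap::ArgMatches) -> Result<()> {
        println!("Run WalCmd with args {:?}", args);
        Ok(())
    }
}

// In main() it would be registered as
//     Box::new(WalCmd { clap_cmd: clap::SubCommand::with_name("wal") }),
// and dispatched in the match on matches.subcommand() like the others.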
@@ -3,73 +3,114 @@
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use std::fs;
|
||||
use parse_duration::parse;
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::thread;
|
||||
use std::{fs::File, fs::OpenOptions, str::FromStr};
|
||||
use std::time::Duration;
|
||||
use std::{env, path::PathBuf};
|
||||
use std::{
|
||||
fs::{File, OpenOptions},
|
||||
net::TcpListener,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
use slog_scope;
|
||||
use slog_stdlog;
|
||||
use slog::{Drain, FnValue};
|
||||
|
||||
use pageserver::page_service;
|
||||
use pageserver::restore_s3;
|
||||
use pageserver::tui;
|
||||
use pageserver::walreceiver;
|
||||
use pageserver::PageServerConf;
|
||||
use pageserver::{branches, page_cache, page_service, tui, PageServerConf};
|
||||
|
||||
fn main() -> Result<(), io::Error> {
|
||||
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
const DEFAULT_GC_PERIOD_SEC: u64 = 10;
|
||||
//const DEFAULT_GC_HORIZON: u64 = 1024 * 1024 * 1024;
|
||||
//const DEFAULT_GC_PERIOD_SEC: u64 = 600;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.arg(Arg::with_name("datadir")
|
||||
.short("D")
|
||||
.long("dir")
|
||||
.takes_value(true)
|
||||
.help("Path to the page server data directory"))
|
||||
.arg(Arg::with_name("wal_producer")
|
||||
.short("w")
|
||||
.long("wal-producer")
|
||||
.takes_value(true)
|
||||
.help("connect to the WAL sender (postgres or wal_acceptor) on connstr (default: 'host=127.0.0.1 port=65432 user=zenith')"))
|
||||
.arg(Arg::with_name("listen")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"))
|
||||
.arg(Arg::with_name("interactive")
|
||||
.short("i")
|
||||
.long("interactive")
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"))
|
||||
.arg(Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"))
|
||||
.arg(Arg::with_name("skip_recovery")
|
||||
.long("skip-recovery")
|
||||
.takes_value(false)
|
||||
.help("Skip S3 recovery procedy and start empty"))
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("interactive")
|
||||
.short("i")
|
||||
.long("interactive")
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("init")
|
||||
.long("init")
|
||||
.takes_value(false)
|
||||
.help("Initialize pageserver repo"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_horizon")
|
||||
.long("gc_horizon")
|
||||
.takes_value(true)
|
||||
.help("Distance from current LSN to perform all wal records cleanup"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_period")
|
||||
.long("gc_period")
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("workdir")
|
||||
.short("D")
|
||||
.long("workdir")
|
||||
.takes_value(true)
|
||||
.help("Working directory for the pageserver"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
data_dir: PathBuf::from("./"),
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
wal_producer_connstr: None,
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
skip_recovery: false,
|
||||
let workdir = if let Some(workdir_arg) = arg_matches.value_of("workdir") {
|
||||
PathBuf::from(workdir_arg)
|
||||
} else if let Some(workdir_arg) = std::env::var_os("ZENITH_REPO_DIR") {
|
||||
PathBuf::from(workdir_arg.to_str().unwrap())
|
||||
} else {
|
||||
PathBuf::from(".zenith")
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
conf.data_dir = PathBuf::from(dir);
|
||||
let pg_distrib_dir: PathBuf = {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
postgres_bin.into()
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
cwd.join("tmp_install")
|
||||
}
|
||||
};
|
||||
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
|
||||
listen_addr: "127.0.0.1:64000".parse().unwrap(),
|
||||
// we will change the current working directory to the repository below,
|
||||
// so always set 'workdir' to '.'
|
||||
workdir: PathBuf::from("."),
|
||||
pg_distrib_dir,
|
||||
};
|
||||
|
||||
if arg_matches.is_present("daemonize") {
|
||||
conf.daemonize = true;
|
||||
}
|
||||
@@ -79,70 +120,84 @@ fn main() -> Result<(), io::Error> {
|
||||
}
|
||||
|
||||
if conf.daemonize && conf.interactive {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"--daemonize is not allowed with --interactive: choose one",
|
||||
));
|
||||
}
|
||||
|
||||
if arg_matches.is_present("skip_recovery") {
|
||||
conf.skip_recovery = true;
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("wal_producer") {
|
||||
conf.wal_producer_connstr = Some(String::from_str(addr).unwrap());
|
||||
eprintln!("--daemonize is not allowed with --interactive: choose one");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("listen") {
|
||||
conf.listen_addr = addr.parse().unwrap();
|
||||
conf.listen_addr = addr.parse()?;
|
||||
}
|
||||
|
||||
if let Some(horizon) = arg_matches.value_of("gc_horizon") {
|
||||
conf.gc_horizon = horizon.parse()?;
|
||||
}
|
||||
|
||||
if let Some(period) = arg_matches.value_of("gc_period") {
|
||||
conf.gc_period = parse(period)?;
|
||||
}
|
||||
|
||||
// The configuration is all set up now. Turn it into a 'static
|
||||
// that can be freely stored in structs and passed across threads
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if arg_matches.is_present("init") {
|
||||
branches::init_repo(conf, &workdir)?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
env::set_current_dir(&workdir)?;
|
||||
|
||||
start_pageserver(conf)
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> {
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
let log_filename = "pageserver.log";
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
|
||||
// Initialize logger
|
||||
let _scope_guard = init_logging(&conf);
|
||||
let _log_guard = slog_stdlog::init().unwrap();
|
||||
let logger_file = log_file.try_clone().unwrap();
|
||||
let _scope_guard = init_logging(&conf, logger_file)?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
|
||||
let tui_thread: Option<thread::JoinHandle<()>>;
|
||||
if conf.interactive {
|
||||
let tui_thread = if conf.interactive {
|
||||
// Initialize the UI
|
||||
tui_thread = Some(
|
||||
Some(
|
||||
thread::Builder::new()
|
||||
.name("UI thread".into())
|
||||
.spawn(|| {
|
||||
let _ = tui::ui_main();
|
||||
})
|
||||
.unwrap(),
|
||||
);
|
||||
//threads.push(tui_thread);
|
||||
)
|
||||
} else {
|
||||
tui_thread = None;
|
||||
}
|
||||
None
|
||||
};
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fpritf's or backtraces.
|
||||
let stdout = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(conf.data_dir.join("pageserver.log"))
|
||||
.unwrap();
|
||||
let stderr = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(conf.data_dir.join("pageserver.log"))
|
||||
.unwrap();
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = log_file.try_clone().unwrap();
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file(conf.data_dir.join("pageserver.pid"))
|
||||
.working_directory(conf.data_dir.clone())
|
||||
.pid_file("pageserver.pid")
|
||||
.working_directory(".")
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
@@ -152,89 +207,62 @@ fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> {
|
||||
}
|
||||
}
|
||||
|
||||
let mut threads = Vec::new();
|
||||
// Check that we can bind to address before further initialization
|
||||
info!("Starting pageserver on {}", conf.listen_addr);
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_addr)?;
|
||||
|
||||
info!("starting...");
|
||||
// Initialize page cache, this will spawn walredo_thread
|
||||
page_cache::init(conf);
|
||||
|
||||
// Before opening up for connections, restore the latest base backup from S3.
|
||||
// (We don't persist anything to local disk at the moment, so we need to do
|
||||
// this at every startup)
|
||||
// TODO move it to a separate function
|
||||
if !conf.skip_recovery {
|
||||
restore_s3::restore_main(&conf);
|
||||
}
|
||||
|
||||
// Create directory for wal-redo datadirs
|
||||
match fs::create_dir(conf.data_dir.join("wal-redo")) {
|
||||
Ok(_) => {}
|
||||
Err(e) => match e.kind() {
|
||||
io::ErrorKind::AlreadyExists => {}
|
||||
_ => {
|
||||
panic!("Failed to create wal-redo data directory: {}", e);
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
// Launch the WAL receiver thread if pageserver was started with --wal-producer
|
||||
// option. It will try to connect to the WAL safekeeper, and stream the WAL. If
|
||||
// the connection is lost, it will reconnect on its own. We just fire and forget
|
||||
// it here.
|
||||
//
|
||||
// All other wal receivers are started on demand by "callmemaybe" command
|
||||
// sent to pageserver.
|
||||
let conf_copy = conf.clone();
|
||||
if let Some(wal_producer) = conf.wal_producer_connstr {
|
||||
let conf = conf_copy.clone();
|
||||
let walreceiver_thread = thread::Builder::new()
|
||||
.name("static WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
walreceiver::thread_main(conf, &wal_producer);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(walreceiver_thread);
|
||||
}
|
||||
|
||||
// GetPage@LSN requests are served by another thread. (It uses async I/O,
|
||||
// but the code in page_service sets up it own thread pool for that)
|
||||
let conf = conf_copy.clone();
|
||||
let page_server_thread = thread::Builder::new()
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
let page_service_thread = thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
page_service::thread_main(conf);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(page_server_thread);
|
||||
.spawn(move || page_service::thread_main(conf, pageserver_listener))?;
|
||||
|
||||
if tui_thread.is_some() {
|
||||
if let Some(tui_thread) = tui_thread {
|
||||
// The TUI thread exits when the user asks to Quit.
|
||||
tui_thread.unwrap().join().unwrap();
|
||||
tui_thread.join().unwrap();
|
||||
} else {
|
||||
// In non-interactive mode, wait forever.
|
||||
for t in threads {
|
||||
t.join().unwrap()
|
||||
}
|
||||
page_service_thread
|
||||
.join()
|
||||
.expect("Page service thread has panicked")?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard {
|
||||
fn init_logging(
|
||||
conf: &PageServerConf,
|
||||
log_file: File,
|
||||
) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
if conf.interactive {
|
||||
tui::init_logging()
|
||||
Ok(tui::init_logging())
|
||||
} else if conf.daemonize {
|
||||
let log = conf.data_dir.join("pageserver.log");
|
||||
let log_file = File::create(log).unwrap_or_else(|_| panic!("Could not create log file"));
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
if record.level().is_at_least(slog::Level::Info) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
slog_scope::set_global_logger(logger)
|
||||
let logger = slog::Logger::root(
|
||||
drain,
|
||||
slog::o!(
|
||||
"location" =>
|
||||
FnValue(move |record| {
|
||||
format!("{}, {}:{}",
|
||||
record.module(),
|
||||
record.file(),
|
||||
record.line()
|
||||
)
|
||||
}
|
||||
)
|
||||
),
|
||||
);
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build().fuse();
|
||||
@@ -248,10 +276,10 @@ fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
slog_scope::set_global_logger(logger)
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
}
|
||||
}
|
||||
|
||||
pageserver/src/branches.rs (new file, 409 lines)
@@ -0,0 +1,409 @@
|
||||
//
|
||||
// Branch management code
|
||||
//
|
||||
// TODO: move all paths construction to conf impl
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use fs::File;
|
||||
use fs_extra;
|
||||
use postgres_ffi::{pg_constants, xlog_utils};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
use std::io::{Read, Write};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fs, io,
|
||||
path::{Path, PathBuf},
|
||||
process::{Command, Stdio},
|
||||
str::FromStr,
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::{repository::Repository, PageServerConf, ZTimelineId};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct BranchInfo {
|
||||
pub name: String,
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub latest_valid_lsn: Option<Lsn>,
|
||||
pub ancestor_id: Option<String>,
|
||||
pub ancestor_lsn: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PointInTime {
|
||||
pub timelineid: ZTimelineId,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
pub fn init_repo(conf: &PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
fs::create_dir_all(repo_dir)
|
||||
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
|
||||
|
||||
env::set_current_dir(repo_dir)?;
|
||||
|
||||
fs::create_dir(std::path::Path::new("timelines"))?;
|
||||
fs::create_dir(std::path::Path::new("refs"))?;
|
||||
fs::create_dir(std::path::Path::new("refs").join("branches"))?;
|
||||
fs::create_dir(std::path::Path::new("refs").join("tags"))?;
|
||||
fs::create_dir(std::path::Path::new("wal-redo"))?;
|
||||
|
||||
println!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
// Create initial timeline
|
||||
let tli = create_timeline(conf, None)?;
|
||||
let timelinedir = conf.timeline_path(tli);
|
||||
println!("created initial timeline {}", tli);
|
||||
|
||||
// Run initdb
|
||||
//
|
||||
// We create the cluster temporarily in a "tmp" directory inside the repository,
|
||||
// and move it to the right location from there.
|
||||
let tmppath = std::path::Path::new("tmp");
|
||||
|
||||
print!("running initdb... ");
|
||||
io::stdout().flush()?;
|
||||
|
||||
let initdb_path = conf.pg_bin_dir().join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", tmppath.to_str().unwrap()])
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to execute initdb")?;
|
||||
if !initdb_output.status.success() {
|
||||
anyhow::bail!("initdb failed");
|
||||
}
|
||||
println!("initdb succeeded");
|
||||
|
||||
// Read control file to extract the LSN and system id
|
||||
let controlfile_path = tmppath.join("global").join("pg_control");
|
||||
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
|
||||
// let systemid = controlfile.system_identifier;
|
||||
let lsn = controlfile.checkPoint;
|
||||
let lsnstr = format!("{:016X}", lsn);
|
||||
|
||||
// Move the initial WAL file
|
||||
fs::rename(
|
||||
tmppath.join("pg_wal").join("000000010000000000000001"),
|
||||
timelinedir
|
||||
.join("wal")
|
||||
.join("000000010000000000000001.partial"),
|
||||
)?;
|
||||
println!("moved initial WAL file");
|
||||
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all(tmppath.join("pg_wal"))?;
|
||||
|
||||
let target = timelinedir.join("snapshots").join(&lsnstr);
|
||||
fs::rename(tmppath, &target)?;
|
||||
|
||||
// Create 'main' branch to refer to the initial timeline
|
||||
let data = tli.to_string();
|
||||
fs::write(conf.branch_path("main"), data)?;
|
||||
println!("created main branch");
|
||||
|
||||
println!(
|
||||
"new zenith repository was created in {}",
|
||||
repo_dir.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
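As a rough picture of what init_repo leaves behind on disk (the timeline id and checkpoint LSN below are invented; the shape follows the directory and file creation above):

// Sketch of the repository layout right after init_repo, assuming the default
// ".zenith" workdir:
//
//   .zenith/
//     refs/
//       branches/main                        <- text file holding the initial timeline id
//       tags/
//     timelines/
//       5e6fc63a.../                         <- 16 random bytes printed as hex
//         ancestor                           <- only for branched timelines; absent here
//         snapshots/0000000001698C48/        <- initdb cluster, named after its checkpoint LSN
//         wal/000000010000000000000001.partial
//     wal-redo/                              <- scratch datadirs for WAL redo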
pub(crate) fn get_branches(
|
||||
conf: &PageServerConf,
|
||||
repository: &dyn Repository,
|
||||
) -> Result<Vec<BranchInfo>> {
|
||||
// Each branch has a corresponding record (text file) in the refs/branches
|
||||
// directory, containing its timeline_id.
|
||||
let branches_dir = std::path::Path::new("refs").join("branches");
|
||||
|
||||
std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
let name = dir_entry.file_name().to_str().unwrap().to_string();
|
||||
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
|
||||
|
||||
let latest_valid_lsn = repository
|
||||
.get_timeline(timeline_id)
|
||||
.map(|timeline| timeline.get_last_valid_lsn())
|
||||
.ok();
|
||||
|
||||
let ancestor_path = conf.ancestor_path(timeline_id);
|
||||
let mut ancestor_id: Option<String> = None;
|
||||
let mut ancestor_lsn: Option<String> = None;
|
||||
|
||||
if ancestor_path.exists() {
|
||||
let ancestor = std::fs::read_to_string(ancestor_path)?;
|
||||
let mut strings = ancestor.split('@');
|
||||
|
||||
ancestor_id = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
ancestor_lsn = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
timeline_id,
|
||||
latest_valid_lsn,
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub(crate) fn get_system_id(conf: &PageServerConf) -> Result<u64> {
|
||||
// let branches = get_branches();
|
||||
|
||||
let branches_dir = std::path::Path::new("refs").join("branches");
|
||||
let branches = std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
let name = dir_entry.file_name().to_str().unwrap().to_string();
|
||||
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
|
||||
Ok((name, timeline_id))
|
||||
})
|
||||
.collect::<Result<HashMap<String, ZTimelineId>>>()?;
|
||||
|
||||
let main_tli = branches
|
||||
.get("main")
|
||||
.ok_or_else(|| anyhow!("Branch main not found"))?;
|
||||
|
||||
let (_, main_snap_dir) = find_latest_snapshot(conf, *main_tli)?;
|
||||
let controlfile_path = main_snap_dir.join("global").join("pg_control");
|
||||
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
|
||||
Ok(controlfile.system_identifier)
|
||||
}
|
||||
|
||||
pub(crate) fn create_branch(
|
||||
conf: &PageServerConf,
|
||||
branchname: &str,
|
||||
startpoint_str: &str,
|
||||
) -> Result<BranchInfo> {
|
||||
if conf.branch_path(&branchname).exists() {
|
||||
anyhow::bail!("branch {} already exists", branchname);
|
||||
}
|
||||
|
||||
let mut startpoint = parse_point_in_time(conf, startpoint_str)?;
|
||||
|
||||
if startpoint.lsn == Lsn(0) {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = find_end_of_wal(conf, startpoint.timelineid)?;
|
||||
println!("branching at end of WAL: {}", end_of_wal);
|
||||
startpoint.lsn = end_of_wal;
|
||||
}
|
||||
|
||||
// create a new timeline for it
|
||||
let newtli = create_timeline(conf, Some(startpoint))?;
|
||||
let newtimelinedir = conf.timeline_path(newtli);
|
||||
|
||||
let data = newtli.to_string();
|
||||
fs::write(conf.branch_path(&branchname), data)?;
|
||||
|
||||
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
|
||||
// TODO: be smarter and avoid the copying...
|
||||
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(conf, startpoint.timelineid)?;
|
||||
let copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), ©_opts)?;
|
||||
|
||||
let oldtimelinedir = conf.timeline_path(startpoint.timelineid);
|
||||
copy_wal(
|
||||
&oldtimelinedir.join("wal"),
|
||||
&newtimelinedir.join("wal"),
|
||||
startpoint.lsn,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
)?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
timeline_id: newtli,
|
||||
latest_valid_lsn: Some(startpoint.lsn),
|
||||
ancestor_id: None,
|
||||
ancestor_lsn: None,
|
||||
})
|
||||
}
|
||||
|
||||
//
|
||||
// Parse user-given string that represents a point-in-time.
|
||||
//
|
||||
// We support multiple variants:
|
||||
//
|
||||
// Raw timeline id in hex, meaning the end of that timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d
|
||||
//
|
||||
// A specific LSN on a timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
|
||||
//
|
||||
// Same, with a human-friendly branch name:
|
||||
// main
|
||||
// main@2/15D3DD8
|
||||
//
|
||||
// Human-friendly tag name:
|
||||
// mytag
|
||||
//
|
||||
//
|
||||
fn parse_point_in_time(conf: &PageServerConf, s: &str) -> Result<PointInTime> {
|
||||
let mut strings = s.split('@');
|
||||
let name = strings.next().unwrap();
|
||||
|
||||
let lsn: Option<Lsn>;
|
||||
if let Some(lsnstr) = strings.next() {
|
||||
lsn = Some(
|
||||
Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")?,
|
||||
);
|
||||
} else {
|
||||
lsn = None
|
||||
}
|
||||
|
||||
// Check if it's a tag
|
||||
if lsn.is_none() {
|
||||
let tagpath = conf.tag_path(name);
|
||||
if tagpath.exists() {
|
||||
let pointstr = fs::read_to_string(tagpath)?;
|
||||
|
||||
return parse_point_in_time(conf, &pointstr);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it's a branch
|
||||
// Check if it's branch @ LSN
|
||||
let branchpath = conf.branch_path(name);
|
||||
if branchpath.exists() {
|
||||
let pointstr = fs::read_to_string(branchpath)?;
|
||||
|
||||
let mut result = parse_point_in_time(conf, &pointstr)?;
|
||||
|
||||
result.lsn = lsn.unwrap_or(Lsn(0));
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
// Check if it's a timelineid
|
||||
// Check if it's timelineid @ LSN
|
||||
if let Ok(timelineid) = ZTimelineId::from_str(name) {
|
||||
let tlipath = conf.timeline_path(timelineid);
|
||||
if tlipath.exists() {
|
||||
return Ok(PointInTime {
|
||||
timelineid,
|
||||
lsn: lsn.unwrap_or(Lsn(0)),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
bail!("could not parse point-in-time {}", s);
|
||||
}
|
||||
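The @-forms above use PostgreSQL's textual hi/lo LSN notation. A small worked example of how the 2/15D3DD8 value from the comment maps onto a 64-bit LSN (assuming Lsn::from_str treats the two halves as the upper and lower 32 bits, as PostgreSQL does):

// Worked example for the "main@2/15D3DD8" form mentioned above.
fn lsn_from_parts(hi: u64, lo: u64) -> u64 {
    (hi << 32) | lo
}

fn main() {
    let lsn = lsn_from_parts(0x2, 0x15D3DD8);
    assert_eq!(lsn, 0x2_015D_3DD8);
    // So "main@2/15D3DD8" names byte position 0x2015D3DD8 on whatever timeline
    // the refs/branches/main file currently points at.
    println!("{:X}", lsn);
}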
|
||||
fn create_timeline(conf: &PageServerConf, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
|
||||
// Create initial timeline
|
||||
let mut tli_buf = [0u8; 16];
|
||||
rand::thread_rng().fill(&mut tli_buf);
|
||||
let timelineid = ZTimelineId::from(tli_buf);
|
||||
|
||||
let timelinedir = conf.timeline_path(timelineid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("snapshots"))?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
///
/// Copy all WAL segments from one directory to another, up to the given LSN.
///
/// If the given LSN is in the middle of a segment, the last segment containing it
/// is written out as .partial, and padded with zeros.
///
fn copy_wal(src_dir: &Path, dst_dir: &Path, upto: Lsn, wal_seg_size: usize) -> Result<()> {
    let last_segno = upto.segment_number(wal_seg_size);
    let last_segoff = upto.segment_offset(wal_seg_size);

    for entry in fs::read_dir(src_dir).unwrap() {
        if let Ok(entry) = entry {
            let entry_name = entry.file_name();
            let fname = entry_name.to_str().unwrap();

            // Check if the filename looks like an xlog file, or a .partial file.
            if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
                continue;
            }
            let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size as usize);

            let copylen;
            let mut dst_fname = PathBuf::from(fname);
            if segno > last_segno {
                // future segment, skip
                continue;
            } else if segno < last_segno {
                copylen = wal_seg_size;
                dst_fname.set_extension("");
            } else {
                copylen = last_segoff;
                dst_fname.set_extension("partial");
            }

            let src_file = File::open(entry.path())?;
            let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
            std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;

            if copylen < wal_seg_size {
                std::io::copy(
                    &mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
                    &mut dst_file,
                )?;
            }
        }
    }
    Ok(())
}

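// copy_wal() relies on mapping an LSN to a segment number and an offset within
// that segment. A minimal sketch of the arithmetic, assuming the standard 16 MB
// WAL segment size (the helper names are illustrative, not zenith_utils' API):
//
// const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
//
// fn segment_number(lsn: u64, seg_size: u64) -> u64 {
//     lsn / seg_size
// }
//
// fn segment_offset(lsn: u64, seg_size: u64) -> u64 {
//     lsn % seg_size
// }
//
// fn main() {
//     // LSN 2/15D3DD8 as a flat 64-bit value.
//     let lsn = (2u64 << 32) | 0x15D3DD8;
//     let segno = segment_number(lsn, WAL_SEGMENT_SIZE);
//     let off = segment_offset(lsn, WAL_SEGMENT_SIZE);
//     // A segment is copied whole if it ends before the cutoff LSN, and
//     // truncated to `off` (as a .partial file) if the cutoff falls inside it.
//     assert_eq!(segno * WAL_SEGMENT_SIZE + off, lsn);
// }
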
// Find the end of valid WAL in a wal directory
pub fn find_end_of_wal(conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
    let waldir = conf.timeline_path(timeline).join("wal");
    let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, pg_constants::WAL_SEGMENT_SIZE, true);
    Ok(Lsn(lsn))
}

// Find the latest snapshot for a timeline
fn find_latest_snapshot(conf: &PageServerConf, timeline: ZTimelineId) -> Result<(Lsn, PathBuf)> {
    let snapshotsdir = conf.snapshots_path(timeline);
    let paths = fs::read_dir(&snapshotsdir)?;
    let mut maxsnapshot = Lsn(0);
    let mut snapshotdir: Option<PathBuf> = None;
    for path in paths {
        let path = path?;
        let filename = path.file_name().to_str().unwrap().to_owned();
        if let Ok(lsn) = Lsn::from_hex(&filename) {
            // Keep the snapshot directory with the highest LSN seen so far,
            // so that the returned path always matches the returned LSN.
            if lsn > maxsnapshot {
                maxsnapshot = lsn;
                snapshotdir = Some(path.path());
            }
        }
    }
    if maxsnapshot == Lsn(0) {
        // TODO: check ancestor timeline
        anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
    }

    Ok((maxsnapshot, snapshotdir.unwrap()))
}

@@ -1,218 +0,0 @@
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use log::*;
|
||||
|
||||
type XLogRecPtr = u64;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone)]
|
||||
/*
|
||||
* Body of CheckPoint XLOG records. This is declared here because we keep
|
||||
* a copy of the latest one in pg_control for possible disaster recovery.
|
||||
* Changing this struct requires a PG_CONTROL_VERSION bump.
|
||||
*/
|
||||
pub struct CheckPoint {
|
||||
pub redo: XLogRecPtr, /* next RecPtr available when we began to
|
||||
* create CheckPoint (i.e. REDO start point) */
|
||||
pub ThisTimeLineID: u32, /* current TLI */
|
||||
pub PrevTimeLineID: u32, /* previous TLI, if this record begins a new
|
||||
* timeline (equals ThisTimeLineID otherwise) */
|
||||
pub fullPageWrites: bool, /* current full_page_writes */
|
||||
pub nextXid: u64, /* next free transaction ID */
|
||||
pub nextOid: u32, /* next free OID */
|
||||
pub nextMulti: u32, /* next free MultiXactId */
|
||||
pub nextMultiOffset: u32, /* next free MultiXact offset */
|
||||
pub oldestXid: u32, /* cluster-wide minimum datfrozenxid */
|
||||
pub oldestXidDB: u32, /* database with minimum datfrozenxid */
|
||||
pub oldestMulti: u32, /* cluster-wide minimum datminmxid */
|
||||
pub oldestMultiDB: u32, /* database with minimum datminmxid */
|
||||
pub time: u64, /* time stamp of checkpoint */
|
||||
pub oldestCommitTsXid: u32, /* oldest Xid with valid commit
|
||||
* timestamp */
|
||||
pub newestCommitTsXid: u32, /* newest Xid with valid commit
|
||||
* timestamp */
|
||||
|
||||
/*
|
||||
* Oldest XID still running. This is only needed to initialize hot standby
|
||||
* mode from an online checkpoint, so we only bother calculating this for
|
||||
* online checkpoints and only when wal_level is replica. Otherwise it's
|
||||
* set to InvalidTransactionId.
|
||||
*/
|
||||
pub oldestActiveXid: u32,
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ControlFileDataZenith {
|
||||
pub system_identifier: u64,
|
||||
pg_control_version: u32, /* PG_CONTROL_VERSION */
|
||||
catalog_version_no: u32, /* see catversion.h */
|
||||
|
||||
state: i32, /* see enum above */
|
||||
time: i64, /* time stamp of last pg_control update */
|
||||
pub checkPoint: XLogRecPtr,
|
||||
checkPointCopy: CheckPoint, /* copy of last check point record */
|
||||
unloggedLSN: XLogRecPtr, /* current fake LSN value, for unlogged rels */
|
||||
minRecoveryPoint: XLogRecPtr,
|
||||
minRecoveryPointTLI: u32,
|
||||
backupStartPoint: XLogRecPtr,
|
||||
backupEndPoint: XLogRecPtr,
|
||||
backupEndRequired: bool,
|
||||
}
|
||||
|
||||
impl ControlFileDataZenith {
|
||||
pub fn new() -> ControlFileDataZenith {
|
||||
ControlFileDataZenith {
|
||||
system_identifier: 0,
|
||||
pg_control_version: 0,
|
||||
catalog_version_no: 0,
|
||||
state: 0,
|
||||
time: 0,
|
||||
checkPoint: 0,
|
||||
checkPointCopy: {
|
||||
CheckPoint {
|
||||
redo: 0,
|
||||
ThisTimeLineID: 0,
|
||||
PrevTimeLineID: 0,
|
||||
fullPageWrites: false,
|
||||
nextXid: 0,
|
||||
nextOid: 0,
|
||||
nextMulti: 0,
|
||||
nextMultiOffset: 0,
|
||||
oldestXid: 0,
|
||||
oldestXidDB: 0,
|
||||
oldestMulti: 0,
|
||||
oldestMultiDB: 0,
|
||||
time: 0,
|
||||
oldestCommitTsXid: 0,
|
||||
newestCommitTsXid: 0,
|
||||
oldestActiveXid: 0,
|
||||
}
|
||||
},
|
||||
unloggedLSN: 0,
|
||||
minRecoveryPoint: 0,
|
||||
minRecoveryPointTLI: 0,
|
||||
backupStartPoint: 0,
|
||||
backupEndPoint: 0,
|
||||
backupEndRequired: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_pg_control(mut buf: Bytes) -> ControlFileDataZenith {
|
||||
info!("decode pg_control");
|
||||
|
||||
let controlfile: ControlFileDataZenith = ControlFileDataZenith {
|
||||
system_identifier: buf.get_u64_le(),
|
||||
pg_control_version: buf.get_u32_le(),
|
||||
catalog_version_no: buf.get_u32_le(),
|
||||
state: buf.get_i32_le(),
|
||||
time: {
|
||||
buf.advance(4);
|
||||
buf.get_i64_le()
|
||||
},
|
||||
checkPoint: buf.get_u64_le(),
|
||||
checkPointCopy: {
|
||||
CheckPoint {
|
||||
redo: buf.get_u64_le(),
|
||||
ThisTimeLineID: buf.get_u32_le(),
|
||||
PrevTimeLineID: buf.get_u32_le(),
|
||||
fullPageWrites: buf.get_u8() != 0,
|
||||
nextXid: {
|
||||
buf.advance(7);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
nextOid: buf.get_u32_le(),
|
||||
nextMulti: buf.get_u32_le(),
|
||||
nextMultiOffset: buf.get_u32_le(),
|
||||
oldestXid: buf.get_u32_le(),
|
||||
oldestXidDB: buf.get_u32_le(),
|
||||
oldestMulti: buf.get_u32_le(),
|
||||
oldestMultiDB: buf.get_u32_le(),
|
||||
time: {
|
||||
buf.advance(4);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
oldestCommitTsXid: buf.get_u32_le(),
|
||||
newestCommitTsXid: buf.get_u32_le(),
|
||||
oldestActiveXid: buf.get_u32_le(),
|
||||
}
|
||||
},
|
||||
unloggedLSN: buf.get_u64_le(),
|
||||
minRecoveryPoint: buf.get_u64_le(),
|
||||
minRecoveryPointTLI: buf.get_u32_le(),
|
||||
backupStartPoint: {
|
||||
buf.advance(4);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
backupEndPoint: buf.get_u64_le(),
|
||||
backupEndRequired: buf.get_u8() != 0,
|
||||
};
|
||||
|
||||
return controlfile;
|
||||
}
|
||||
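// The buf.advance(4) and buf.advance(7) calls above skip the padding bytes the
// C compiler inserts so that 8-byte fields start on 8-byte boundaries. A small,
// purely illustrative sketch of that padding rule:
//
// fn padding(offset: usize, align: usize) -> usize {
//     (align - offset % align) % align
// }
//
// fn main() {
//     // An i64 following three u32/i32 fields and a u64 sits at offset 20,
//     // so 4 padding bytes are skipped before `time` in decode_pg_control().
//     assert_eq!(padding(20, 8), 4);
//     // Inside CheckPoint, a u64 following a single bool byte at offset 17
//     // needs 7 padding bytes, matching the advance(7) before `nextXid`.
//     assert_eq!(padding(17, 8), 7);
// }
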
|
||||
pub fn parse_controlfile(b: Bytes) {
|
||||
let controlfile = decode_pg_control(b);
|
||||
|
||||
info!(
|
||||
"controlfile {:X}/{:X}",
|
||||
controlfile.checkPoint >> 32,
|
||||
controlfile.checkPoint
|
||||
);
|
||||
info!("controlfile {:?}", controlfile);
|
||||
}
|
||||
|
||||
const MAX_MAPPINGS: usize = 62;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct RelMapping {
|
||||
mapoid: u32, /* OID of a catalog */
|
||||
mapfilenode: u32, /* its filenode number */
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RelMapFile {
|
||||
magic: i32, /* always RELMAPPER_FILEMAGIC */
|
||||
num_mappings: i32, /* number of valid RelMapping entries */
|
||||
mappings: [u8; MAX_MAPPINGS * 8],
|
||||
crc: u32, /* CRC of all above */
|
||||
pad: i32, /* to make the struct size be 512 exactly */
|
||||
}
|
||||
|
||||
pub fn decode_filemapping(mut buf: Bytes) -> RelMapFile {
|
||||
info!("decode filemap");
|
||||
|
||||
let file: RelMapFile = RelMapFile {
|
||||
magic: buf.get_i32_le(), /* always RELMAPPER_FILEMAGIC */
|
||||
num_mappings: buf.get_i32_le(), /* number of valid RelMapping entries */
|
||||
mappings: {
|
||||
let mut arr = [0 as u8; MAX_MAPPINGS * 8];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
arr
|
||||
},
|
||||
crc: buf.get_u32_le(), /* CRC of all above */
|
||||
pad: buf.get_i32_le(),
|
||||
};
|
||||
|
||||
info!("decode filemap {:?}", file);
|
||||
file
|
||||
}
|
||||
|
||||
pub fn write_buf_to_file(filepath: String, buf: Bytes, blkno: u32) {
|
||||
info!("write_buf_to_file {}", filepath.clone());
|
||||
|
||||
let mut buffer = File::create(filepath.clone()).unwrap();
|
||||
buffer.seek(SeekFrom::Start(8192 * blkno as u64)).unwrap();
|
||||
|
||||
buffer.write_all(&buf).unwrap();
|
||||
|
||||
info!("DONE write_buf_to_file {}", filepath);
|
||||
}
|
||||
@@ -1,12 +1,17 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use std::fmt;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod controlfile;
|
||||
pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
#[allow(dead_code)]
|
||||
pub mod pg_constants;
|
||||
pub mod restore_s3;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tui;
|
||||
pub mod tui_event;
|
||||
mod tui_logger;
|
||||
@@ -14,13 +19,120 @@ pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PageServerConf {
|
||||
pub data_dir: PathBuf,
|
||||
pub daemonize: bool,
|
||||
pub interactive: bool,
|
||||
pub wal_producer_connstr: Option<String>,
|
||||
pub listen_addr: SocketAddr,
|
||||
pub skip_recovery: bool,
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
|
||||
// Repository directory, relative to current working directory.
|
||||
// Normally, the page server changes the current working directory
|
||||
// to the repository, and 'workdir' is always '.'. But we don't do
|
||||
// that during unit testing, because the current directory is global
|
||||
// to the process but different unit tests work on different
|
||||
// repositories.
|
||||
pub workdir: PathBuf,
|
||||
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
//
|
||||
// Repository paths, relative to workdir.
|
||||
//
|
||||
|
||||
fn tag_path(&self, name: &str) -> PathBuf {
|
||||
self.workdir.join("refs").join("tags").join(name)
|
||||
}
|
||||
|
||||
fn branch_path(&self, name: &str) -> PathBuf {
|
||||
self.workdir.join("refs").join("branches").join(name)
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.workdir.join("timelines").join(timelineid.to_string())
|
||||
}
|
||||
|
||||
fn snapshots_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.timeline_path(timelineid).join("snapshots")
|
||||
}
|
||||
|
||||
fn ancestor_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.timeline_path(timelineid).join("ancestor")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
}
|
||||
|
||||
/// Zenith Timeline ID is a 128-bit random ID.
|
||||
///
|
||||
/// Zenith timeline IDs are different from PostgreSQL timeline
|
||||
/// IDs. They serve a similar purpose though: they differentiate
|
||||
/// between different "histories" of the same cluster. However,
|
||||
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
||||
/// 32-bits wide, and they must be in ascending order in any given
|
||||
/// timeline history. Those limitations mean that we cannot generate a
|
||||
/// new PostgreSQL timeline ID by just generating a random number. And
|
||||
/// that in turn is problematic for the "pull/push" workflow, where you
|
||||
/// have a local copy of a zenith repository, and you periodically sync
|
||||
/// the local changes with a remote server. When you work "detached"
|
||||
/// from the remote server, you cannot create a PostgreSQL timeline ID
|
||||
/// that's guaranteed to be different from all existing timelines in
|
||||
/// the remote server. For example, if two people are having a clone of
|
||||
/// the repository on their laptops, and they both create a new branch
|
||||
/// with different name. What timeline ID would they assign to their
|
||||
/// branches? If they pick the same one, and later try to push the
|
||||
/// branches to the same remote server, they will get mixed up.
|
||||
///
|
||||
/// To avoid those issues, Zenith has its own concept of timelines that
|
||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||
/// is usually printed out as a hex string.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct ZTimelineId([u8; 16]);
|
||||
|
||||
impl FromStr for ZTimelineId {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<ZTimelineId, Self::Err> {
|
||||
let timelineid = hex::decode(s)?;
|
||||
|
||||
let mut buf: [u8; 16] = [0u8; 16];
|
||||
buf.copy_from_slice(timelineid.as_slice());
|
||||
Ok(ZTimelineId(buf))
|
||||
}
|
||||
}
|
||||
|
||||
impl ZTimelineId {
|
||||
pub fn from(b: [u8; 16]) -> ZTimelineId {
|
||||
ZTimelineId(b)
|
||||
}
|
||||
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZTimelineId {
|
||||
let mut arr = [0u8; 16];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
ZTimelineId::from(arr)
|
||||
}
|
||||
|
||||
pub fn as_arr(&self) -> [u8; 16] {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ZTimelineId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(&hex::encode(self.0))
|
||||
}
|
||||
}
|
||||
|
||||
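// Standalone illustration of the hex round-trip implemented by FromStr and
// Display above, using the hex crate with a local stand-in type instead of the
// real ZTimelineId:
//
// struct Id([u8; 16]);
//
// fn parse_id(s: &str) -> Result<Id, hex::FromHexError> {
//     let bytes = hex::decode(s)?;
//     // Note: like the FromStr impl above, this panics if the string does not
//     // decode to exactly 16 bytes; a production version should check the length.
//     let mut buf = [0u8; 16];
//     buf.copy_from_slice(&bytes);
//     Ok(Id(buf))
// }
//
// fn main() -> Result<(), hex::FromHexError> {
//     let id = parse_id("bc62e7d612d0e6fe8f99a6dd2f281f9d")?;
//     assert_eq!(hex::encode(id.0), "bc62e7d612d0e6fe8f99a6dd2f281f9d");
//     Ok(())
// }
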
@@ -1,793 +1,32 @@
|
||||
//
|
||||
// Page Cache holds all the different page versions and WAL records
|
||||
//
|
||||
// The Page Cache is a BTreeMap, keyed by the RelFileNode and block number, and the LSN.
|
||||
// The BTreeMap is protected by a Mutex, and each cache entry is protected by another
|
||||
// per-entry mutex.
|
||||
//
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server. Currently, a Page Server can only manage one repository, so there
|
||||
//! isn't much here. If we implement multi-tenancy, this will probably be changed into
|
||||
//! a hash map, keyed by the tenant ID.
|
||||
|
||||
use core::ops::Bound::Included;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::{convert::TryInto, ops::AddAssign};
|
||||
|
||||
use std::error::Error;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
// use tokio::sync::RwLock;
|
||||
use bytes::Bytes;
|
||||
use crate::repository::rocksdb::RocksRepository;
|
||||
use crate::repository::Repository;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use rand::Rng;
|
||||
|
||||
use crate::{controlfile, walredo, PageServerConf};
|
||||
|
||||
use crossbeam_channel::unbounded;
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
|
||||
// Timeout when waiting for the WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
static TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
pub struct PageCache {
|
||||
shared: Mutex<PageCacheShared>,
|
||||
|
||||
// Channel for communicating with the WAL redo process here.
|
||||
pub walredo_sender: Sender<Arc<CacheEntry>>,
|
||||
pub walredo_receiver: Receiver<Arc<CacheEntry>>,
|
||||
|
||||
valid_lsn_condvar: Condvar,
|
||||
|
||||
// Counters, for metrics collection.
|
||||
pub num_entries: AtomicU64,
|
||||
pub num_page_images: AtomicU64,
|
||||
pub num_wal_records: AtomicU64,
|
||||
pub num_getpage_requests: AtomicU64,
|
||||
|
||||
// copies of shared.first/last_valid_lsn fields (copied here so
|
||||
// that they can be read without acquiring the mutex).
|
||||
pub first_valid_lsn: AtomicU64,
|
||||
pub last_valid_lsn: AtomicU64,
|
||||
pub last_record_lsn: AtomicU64,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PageCacheStats {
|
||||
pub num_entries: u64,
|
||||
pub num_page_images: u64,
|
||||
pub num_wal_records: u64,
|
||||
pub num_getpage_requests: u64,
|
||||
pub first_valid_lsn: u64,
|
||||
pub last_valid_lsn: u64,
|
||||
pub last_record_lsn: u64,
|
||||
}
|
||||
|
||||
impl AddAssign for PageCacheStats {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
*self = Self {
|
||||
num_entries: self.num_entries + other.num_entries,
|
||||
num_page_images: self.num_page_images + other.num_page_images,
|
||||
num_wal_records: self.num_wal_records + other.num_wal_records,
|
||||
num_getpage_requests: self.num_getpage_requests + other.num_getpage_requests,
|
||||
first_valid_lsn: self.first_valid_lsn + other.first_valid_lsn,
|
||||
last_valid_lsn: self.last_valid_lsn + other.last_valid_lsn,
|
||||
last_record_lsn: self.last_record_lsn + other.last_record_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Shared data structure, holding page cache and related auxiliary information
|
||||
//
|
||||
struct PageCacheShared {
|
||||
// The actual page cache
|
||||
pagecache: BTreeMap<CacheKey, Arc<CacheEntry>>,
|
||||
|
||||
// Relation n_blocks cache
|
||||
//
|
||||
// This hashtable should be updated together with the pagecache. Now it is
|
||||
// accessed unreasonably often through the smgr_nblocks(). It is better to just
|
||||
// cache it in postgres smgr and ask only on restart.
|
||||
relsize_cache: HashMap<RelTag, u32>,
|
||||
|
||||
// What page versions do we hold in the cache? If we get GetPage with
|
||||
// LSN < first_valid_lsn, that's an error because we (no longer) hold that
|
||||
// page version. If we get a request > last_valid_lsn, we need to wait until
|
||||
// we receive all the WAL up to the request.
|
||||
//
|
||||
// last_record_lsn points to the end of last processed WAL record.
|
||||
// It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
|
||||
// after the end of last record, but not the whole next record yet. In the
|
||||
// page cache, we care about last_valid_lsn, but if the WAL receiver needs to
|
||||
// restart the streaming, it needs to restart at the end of last record, so
|
||||
// we track them separately. last_record_lsn should perhaps be in
|
||||
// walreceiver.rs instead of here, but it seems convenient to keep all three
|
||||
// values together.
|
||||
//
|
||||
first_valid_lsn: u64,
|
||||
last_valid_lsn: u64,
|
||||
last_record_lsn: u64,
|
||||
|
||||
controldata: controlfile::ControlFileDataZenith,
|
||||
}
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
lazy_static! {
|
||||
pub static ref PAGECACHES: Mutex<HashMap<u64, Arc<PageCache>>> = Mutex::new(HashMap::new());
|
||||
pub static ref REPOSITORY: Mutex<Option<Arc<dyn Repository + Send + Sync>>> = Mutex::new(None);
|
||||
}
|
||||
|
||||
pub fn get_pagecache(conf: PageServerConf, sys_id: u64) -> Arc<PageCache> {
|
||||
let mut pcaches = PAGECACHES.lock().unwrap();
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
if !pcaches.contains_key(&sys_id) {
|
||||
pcaches.insert(sys_id, Arc::new(init_page_cache()));
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf);
|
||||
|
||||
// Initialize the WAL redo thread
|
||||
//
|
||||
// Currently join_handle is not saved anywhere and we won't restart the thread
// if it is dead. We may later stop these threads after some inactivity period
// and restart them on demand.
|
||||
let _walredo_thread = thread::Builder::new()
|
||||
.name("WAL redo thread".into())
|
||||
.spawn(move || {
|
||||
walredo::wal_redo_main(conf, sys_id);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
// we have already changed current dir to the repository.
|
||||
let repo = RocksRepository::new(conf, Arc::new(walredo_mgr));
|
||||
|
||||
pcaches.get(&sys_id).unwrap().clone()
|
||||
*m = Some(Arc::new(repo));
|
||||
}
|
||||
|
||||
fn init_page_cache() -> PageCache {
|
||||
// Initialize the channel between the page cache and the WAL applicator
|
||||
let (s, r) = unbounded();
|
||||
|
||||
PageCache {
|
||||
shared: Mutex::new(PageCacheShared {
|
||||
pagecache: BTreeMap::new(),
|
||||
relsize_cache: HashMap::new(),
|
||||
first_valid_lsn: 0,
|
||||
last_valid_lsn: 0,
|
||||
last_record_lsn: 0,
|
||||
controldata: controlfile::ControlFileDataZenith::new(),
|
||||
}),
|
||||
valid_lsn_condvar: Condvar::new(),
|
||||
|
||||
walredo_sender: s,
|
||||
walredo_receiver: r,
|
||||
|
||||
num_entries: AtomicU64::new(0),
|
||||
num_page_images: AtomicU64::new(0),
|
||||
num_wal_records: AtomicU64::new(0),
|
||||
num_getpage_requests: AtomicU64::new(0),
|
||||
|
||||
first_valid_lsn: AtomicU64::new(0),
|
||||
last_valid_lsn: AtomicU64::new(0),
|
||||
last_record_lsn: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// We store two kinds of entries in the page cache:
|
||||
//
|
||||
// 1. Ready-made images of the block
|
||||
// 2. WAL records, to be applied on top of the "previous" entry
|
||||
//
|
||||
// Some WAL records will initialize the page from scratch. For such records,
|
||||
// the 'will_init' flag is set. They don't need the previous page image before
|
||||
// applying. The 'will_init' flag is set for records containing a full-page image,
|
||||
// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
|
||||
// stored directly in the cache entry in that you still need to run the WAL redo
|
||||
// routine to generate the page image.
|
||||
//
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)]
|
||||
pub struct CacheKey {
|
||||
pub tag: BufferTag,
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
pub struct CacheEntry {
|
||||
pub key: CacheKey,
|
||||
|
||||
pub content: Mutex<CacheEntryContent>,
|
||||
|
||||
// Condition variable used by the WAL redo service, to wake up
|
||||
// requester.
|
||||
//
|
||||
// FIXME: this takes quite a lot of space. Consider using parking_lot::Condvar
|
||||
// or something else.
|
||||
pub walredo_condvar: Condvar,
|
||||
}
|
||||
|
||||
pub struct CacheEntryContent {
|
||||
pub page_image: Option<Bytes>,
|
||||
pub wal_record: Option<WALRecord>,
|
||||
pub apply_pending: bool,
|
||||
}
|
||||
|
||||
impl CacheEntry {
|
||||
fn new(key: CacheKey) -> CacheEntry {
|
||||
CacheEntry {
|
||||
key: key,
|
||||
content: Mutex::new(CacheEntryContent {
|
||||
page_image: None,
|
||||
wal_record: None,
|
||||
apply_pending: false,
|
||||
}),
|
||||
walredo_condvar: Condvar::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Hash, Clone, Copy, Debug)]
|
||||
pub struct RelTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u8,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug)]
|
||||
pub struct BufferTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u8,
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: u64, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub rec: Bytes,
|
||||
}
|
||||
|
||||
// Public interface functions
|
||||
|
||||
impl PageCache {
|
||||
pub fn get_nonrel_page(&self, tag: BufferTag, _reqlsn: u64) -> Result<Bytes, Box<dyn Error>> {
|
||||
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Now we don't have versioning for non-rel pages.
|
||||
// Also at bootstrap we don't know lsn for some files.
|
||||
// So always request the very latest version
|
||||
// let lsn = reqlsn;
|
||||
|
||||
let lsn = u64::MAX;
|
||||
|
||||
let minkey = CacheKey { tag: tag, lsn: 0 };
|
||||
// Look up to the largest lsn
|
||||
let maxkey = CacheKey { tag: tag, lsn: lsn };
|
||||
|
||||
let entry_rc: Arc<CacheEntry>;
|
||||
{
|
||||
let shared = self.shared.lock().unwrap();
|
||||
|
||||
let pagecache = &shared.pagecache;
|
||||
info!("got pagecache {}", pagecache.len());
|
||||
|
||||
let mut entries = pagecache.range((Included(&minkey), Included(&maxkey)));
|
||||
|
||||
let entry_opt = entries.next_back();
|
||||
|
||||
if entry_opt.is_none() {
|
||||
return Err(format!(
|
||||
"not found non-rel page with LSN {} for {}/{}/{}.{} blk {}",
|
||||
lsn, tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum
|
||||
))?;
|
||||
}
|
||||
|
||||
info!(
|
||||
"found non-rel page with LSN {} for {}/{}/{}.{} blk {}",
|
||||
lsn, tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum
|
||||
);
|
||||
|
||||
let (_key, entry) = entry_opt.unwrap();
|
||||
entry_rc = entry.clone();
|
||||
|
||||
// Now that we have a reference to the cache entry, drop the lock on the map.
|
||||
// It's important to do this before waiting on the condition variable below,
|
||||
// and better to do it as soon as possible to maximize concurrency.
|
||||
}
|
||||
|
||||
// Lock the cache entry and dig the page image out of it.
|
||||
let page_img: Bytes;
|
||||
{
|
||||
let entry_content = entry_rc.content.lock().unwrap();
|
||||
|
||||
if let Some(img) = &entry_content.page_image {
|
||||
assert!(!entry_content.apply_pending);
|
||||
page_img = img.clone();
|
||||
} else if entry_content.wal_record.is_some() {
|
||||
return Err("non-rel WAL redo is not implemented yet".into());
|
||||
//
|
||||
// If this page needs to be reconstructed by applying some WAL,
|
||||
// send a request to the WAL redo thread.
|
||||
//
|
||||
// if !entry_content.apply_pending {
|
||||
// assert!(!entry_content.apply_pending);
|
||||
// entry_content.apply_pending = true;
|
||||
|
||||
// let s = &self.walredo_sender;
|
||||
// s.send(entry_rc.clone())?;
|
||||
// }
|
||||
|
||||
// while entry_content.apply_pending {
|
||||
// entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap();
|
||||
//}
|
||||
|
||||
// We should now have a page image. If we don't, it means that WAL redo
|
||||
// failed to reconstruct it. WAL redo should've logged that error already.
|
||||
// page_img = match &entry_content.page_image {
|
||||
// Some(p) => p.clone(),
|
||||
// None => {
|
||||
// error!("could not apply WAL to reconstruct page image for GetPage@LSN request");
|
||||
// return Err("could not apply WAL to reconstruct page image".into());
|
||||
// }
|
||||
// };
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
return Err(format!("no page image or WAL record for requested page"))?;
|
||||
}
|
||||
}
|
||||
|
||||
trace!(
|
||||
"Returning page for {}/{}/{}.{} blk {}",
|
||||
tag.spcnode,
|
||||
tag.dbnode,
|
||||
tag.relnode,
|
||||
tag.forknum,
|
||||
tag.blknum
|
||||
);
|
||||
|
||||
return Ok(page_img);
|
||||
}
|
||||
|
||||
//
|
||||
// GetPage@LSN
|
||||
//
|
||||
// Returns an 8k page image
|
||||
//
|
||||
pub fn get_page_at_lsn(&self, tag: BufferTag, reqlsn: u64) -> Result<Bytes, Box<dyn Error>> {
|
||||
let mut lsn = reqlsn;
|
||||
|
||||
if tag.forknum > 40 {
|
||||
info!(
|
||||
"get_page_at_lsn got request for page with LSN {} for {}/{}/{}.{} blk {}",
|
||||
lsn, tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum
|
||||
);
|
||||
|
||||
return self.get_nonrel_page(tag, lsn);
|
||||
}
|
||||
|
||||
if reqlsn == 0 {
|
||||
let c = self.get_controldata();
|
||||
lsn = c.checkPoint;
|
||||
|
||||
info!("update reqlsn get_page_at_lsn got request for page with LSN {} for {}/{}/{}.{} blk {}", lsn,
|
||||
tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum);
|
||||
}
|
||||
|
||||
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Look up cache entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let minkey = CacheKey { tag: tag, lsn: 0 };
|
||||
let maxkey = CacheKey { tag: tag, lsn: lsn };
|
||||
let entry_rc: Arc<CacheEntry>;
|
||||
{
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
let mut waited = false;
|
||||
|
||||
// When the server has just started and created a checkpoint LSN,
// but we have not yet established the connection,
// the requested LSN will be larger than the one we have
|
||||
while lsn > shared.last_valid_lsn + 500 {
|
||||
// TODO: Wait for the WAL receiver to catch up
|
||||
waited = true;
|
||||
trace!(
|
||||
"not caught up yet: {}, requested {}",
|
||||
shared.last_valid_lsn,
|
||||
lsn
|
||||
);
|
||||
let wait_result = self
|
||||
.valid_lsn_condvar
|
||||
.wait_timeout(shared, TIMEOUT)
|
||||
.unwrap();
|
||||
|
||||
shared = wait_result.0;
|
||||
if wait_result.1.timed_out() {
|
||||
return Err(format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive",
|
||||
lsn
|
||||
))?;
|
||||
}
|
||||
}
|
||||
if waited {
|
||||
trace!("caught up now, continuing");
|
||||
}
|
||||
|
||||
if lsn < shared.first_valid_lsn {
|
||||
return Err(format!("LSN {} has already been removed", lsn))?;
|
||||
}
|
||||
|
||||
let pagecache = &shared.pagecache;
|
||||
|
||||
let mut entries = pagecache.range((Included(&minkey), Included(&maxkey)));
|
||||
|
||||
let entry_opt = entries.next_back();
|
||||
|
||||
if entry_opt.is_none() {
|
||||
//static ZERO_PAGE:[u8; 8192] = [0 as u8; 8192];
|
||||
//return Ok(Bytes::from_static(&ZERO_PAGE));
|
||||
return Err("could not find page image")?;
|
||||
}
|
||||
let (_key, entry) = entry_opt.unwrap();
|
||||
entry_rc = entry.clone();
|
||||
|
||||
// Now that we have a reference to the cache entry, drop the lock on the map.
|
||||
// It's important to do this before waiting on the condition variable below,
|
||||
// and better to do it as soon as possible to maximize concurrency.
|
||||
}
|
||||
|
||||
// Lock the cache entry and dig the page image out of it.
|
||||
let page_img: Bytes;
|
||||
{
|
||||
let mut entry_content = entry_rc.content.lock().unwrap();
|
||||
|
||||
if let Some(img) = &entry_content.page_image {
|
||||
assert!(!entry_content.apply_pending);
|
||||
page_img = img.clone();
|
||||
} else if entry_content.wal_record.is_some() {
|
||||
//
|
||||
// If this page needs to be reconstructed by applying some WAL,
|
||||
// send a request to the WAL redo thread.
|
||||
//
|
||||
if !entry_content.apply_pending {
|
||||
assert!(!entry_content.apply_pending);
|
||||
entry_content.apply_pending = true;
|
||||
|
||||
let s = &self.walredo_sender;
|
||||
s.send(entry_rc.clone())?;
|
||||
}
|
||||
|
||||
while entry_content.apply_pending {
|
||||
entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap();
|
||||
}
|
||||
|
||||
// We should now have a page image. If we don't, it means that WAL redo
|
||||
// failed to reconstruct it. WAL redo should've logged that error already.
|
||||
page_img = match &entry_content.page_image {
|
||||
Some(p) => p.clone(),
|
||||
None => {
|
||||
error!(
|
||||
"could not apply WAL to reconstruct page image for GetPage@LSN request"
|
||||
);
|
||||
return Err("could not apply WAL to reconstruct page image".into());
|
||||
}
|
||||
};
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
return Err(format!("no page image or WAL record for requested page"))?;
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: assumes little-endian. Only used for the debugging log though
|
||||
let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
|
||||
let page_lsn_lo = u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
|
||||
trace!(
|
||||
"Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}",
|
||||
page_lsn_hi,
|
||||
page_lsn_lo,
|
||||
tag.spcnode,
|
||||
tag.dbnode,
|
||||
tag.relnode,
|
||||
tag.forknum,
|
||||
tag.blknum
|
||||
);
|
||||
|
||||
return Ok(page_img);
|
||||
}
|
||||
|
||||
//
|
||||
// Collect all the WAL records that are needed to reconstruct a page
|
||||
// image for the given cache entry.
|
||||
//
|
||||
// Returns an old page image (if any), and a vector of WAL records to apply
|
||||
// over it.
|
||||
//
|
||||
pub fn collect_records_for_apply(&self, entry: &CacheEntry) -> (Option<Bytes>, Vec<WALRecord>) {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let shared = self.shared.lock().unwrap();
|
||||
let pagecache = &shared.pagecache;
|
||||
|
||||
let minkey = CacheKey {
|
||||
tag: entry.key.tag,
|
||||
lsn: 0,
|
||||
};
|
||||
let maxkey = CacheKey {
|
||||
tag: entry.key.tag,
|
||||
lsn: entry.key.lsn,
|
||||
};
|
||||
let entries = pagecache.range((Included(&minkey), Included(&maxkey)));
|
||||
|
||||
// the last entry in the range should be the CacheEntry we were given
|
||||
//let _last_entry = entries.next_back();
|
||||
//assert!(last_entry == entry);
|
||||
|
||||
let mut base_img: Option<Bytes> = None;
|
||||
let mut records: Vec<WALRecord> = Vec::new();
|
||||
|
||||
// Scan backwards, collecting the WAL records, until we hit an
|
||||
// old page image.
|
||||
for (_key, e) in entries.rev() {
|
||||
let e = e.content.lock().unwrap();
|
||||
|
||||
if let Some(img) = &e.page_image {
|
||||
// We have a base image. No need to dig deeper into the list of
|
||||
// records
|
||||
base_img = Some(img.clone());
|
||||
break;
|
||||
} else if let Some(rec) = &e.wal_record {
|
||||
records.push(rec.clone());
|
||||
|
||||
// If this WAL record initializes the page, no need to dig deeper.
|
||||
if rec.will_init {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
panic!("no base image and no WAL record on cache entry");
|
||||
}
|
||||
}
|
||||
|
||||
records.reverse();
|
||||
return (base_img, records);
|
||||
}
|
||||
|
||||
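// The backward scan above stops at the first full page image, or at a record
// with will_init set, and then replays forward. A tiny standalone sketch of
// that collection logic over a plain vector (simplified stand-in types):
//
// enum Version {
//     Image(&'static str),                             // ready-made page image
//     Record { will_init: bool, body: &'static str },  // WAL record to apply
// }
//
// fn collect_for_apply(versions: &[Version]) -> (Option<&'static str>, Vec<&'static str>) {
//     let mut base_img = None;
//     let mut records = Vec::new();
//     // Scan newest to oldest, collecting records until a base is found.
//     for v in versions.iter().rev() {
//         match v {
//             Version::Image(img) => {
//                 base_img = Some(*img);
//                 break;
//             }
//             Version::Record { will_init, body } => {
//                 records.push(*body);
//                 if *will_init {
//                     break;
//                 }
//             }
//         }
//     }
//     records.reverse(); // oldest first, ready to replay in order
//     (base_img, records)
// }
//
// fn main() {
//     let history = vec![
//         Version::Image("img@10"),
//         Version::Record { will_init: false, body: "rec@11" },
//         Version::Record { will_init: false, body: "rec@12" },
//     ];
//     assert_eq!(
//         collect_for_apply(&history),
//         (Some("img@10"), vec!["rec@11", "rec@12"])
//     );
// }
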
//
|
||||
// Adds a WAL record to the page cache
|
||||
//
|
||||
pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
|
||||
let key = CacheKey {
|
||||
tag: tag,
|
||||
lsn: rec.lsn,
|
||||
};
|
||||
|
||||
let entry = CacheEntry::new(key.clone());
|
||||
entry.content.lock().unwrap().wal_record = Some(rec);
|
||||
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
let rel_tag = RelTag {
|
||||
spcnode: tag.spcnode,
|
||||
dbnode: tag.dbnode,
|
||||
relnode: tag.relnode,
|
||||
forknum: tag.forknum,
|
||||
};
|
||||
let rel_entry = shared.relsize_cache.entry(rel_tag).or_insert(0);
|
||||
if tag.blknum >= *rel_entry {
|
||||
*rel_entry = tag.blknum + 1;
|
||||
}
|
||||
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
|
||||
let oldentry = shared.pagecache.insert(key, Arc::new(entry));
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
if !oldentry.is_none() {
|
||||
error!("overwriting WAL record in page cache");
|
||||
}
|
||||
|
||||
self.num_wal_records.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
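// put_wal_record() also bumps the cached relation size, so writing block N
// implicitly extends the relation to N+1 blocks. A minimal sketch of that
// entry().or_insert() pattern, keyed by a plain tuple instead of RelTag:
//
// use std::collections::HashMap;
//
// fn note_block_written(sizes: &mut HashMap<(u32, u32), u32>, rel: (u32, u32), blknum: u32) {
//     let entry = sizes.entry(rel).or_insert(0);
//     if blknum >= *entry {
//         *entry = blknum + 1;
//     }
// }
//
// fn main() {
//     let mut sizes = HashMap::new();
//     note_block_written(&mut sizes, (111, 1000), 0);
//     note_block_written(&mut sizes, (111, 1000), 7);
//     note_block_written(&mut sizes, (111, 1000), 3); // does not shrink
//     assert_eq!(sizes[&(111, 1000)], 8);
// }
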
//
|
||||
// Memorize a full image of a page version
|
||||
//
|
||||
pub fn put_page_image(&self, tag: BufferTag, lsn: u64, img: Bytes) {
|
||||
let key = CacheKey { tag: tag, lsn: lsn };
|
||||
|
||||
let entry = CacheEntry::new(key.clone());
|
||||
entry.content.lock().unwrap().page_image = Some(img);
|
||||
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
let pagecache = &mut shared.pagecache;
|
||||
|
||||
let oldentry = pagecache.insert(key, Arc::new(entry));
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
assert!(oldentry.is_none());
|
||||
|
||||
debug!(
|
||||
"inserted page image for {}/{}/{}_{} blk {} at {}",
|
||||
tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn
|
||||
);
|
||||
|
||||
self.num_page_images.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
pub fn advance_last_valid_lsn(&self, lsn: u64) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
// Can't move backwards.
|
||||
assert!(lsn >= shared.last_valid_lsn);
|
||||
|
||||
shared.last_valid_lsn = lsn;
|
||||
self.valid_lsn_condvar.notify_all();
|
||||
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
// NOTE: this updates last_valid_lsn as well.
|
||||
//
|
||||
pub fn advance_last_record_lsn(&self, lsn: u64) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
// Can't move backwards.
|
||||
assert!(lsn >= shared.last_valid_lsn);
|
||||
assert!(lsn >= shared.last_record_lsn);
|
||||
|
||||
shared.last_valid_lsn = lsn;
|
||||
shared.last_record_lsn = lsn;
|
||||
self.valid_lsn_condvar.notify_all();
|
||||
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
self.last_record_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
pub fn _advance_first_valid_lsn(&self, lsn: u64) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
// Can't move backwards.
|
||||
assert!(lsn >= shared.first_valid_lsn);
|
||||
|
||||
// Can't overtake last_valid_lsn (except when we're
|
||||
// initializing the system and last_valid_lsn hasn't been set yet).
|
||||
assert!(shared.last_valid_lsn == 0 || lsn < shared.last_valid_lsn);
|
||||
|
||||
shared.first_valid_lsn = lsn;
|
||||
self.first_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn init_valid_lsn(&self, lsn: u64) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
assert!(shared.first_valid_lsn == 0);
|
||||
assert!(shared.last_valid_lsn == 0);
|
||||
assert!(shared.last_record_lsn == 0);
|
||||
|
||||
shared.first_valid_lsn = lsn;
|
||||
shared.last_valid_lsn = lsn;
|
||||
shared.last_record_lsn = lsn;
|
||||
|
||||
self.first_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
self.last_record_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get_last_valid_lsn(&self) -> u64 {
|
||||
let shared = self.shared.lock().unwrap();
|
||||
|
||||
return shared.last_record_lsn;
|
||||
}
|
||||
|
||||
pub fn set_controldata(&self, c: controlfile::ControlFileDataZenith) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
shared.controldata = c;
|
||||
}
|
||||
|
||||
pub fn get_controldata(&self) -> controlfile::ControlFileDataZenith {
|
||||
let shared = self.shared.lock().unwrap();
|
||||
return shared.controldata.clone();
|
||||
}
|
||||
|
||||
//
|
||||
// Simple test function for the WAL redo code:
|
||||
//
|
||||
// 1. Pick a page from the page cache at random.
|
||||
// 2. Request that page with GetPage@LSN, using Max LSN (i.e. get the latest page version)
|
||||
//
|
||||
//
|
||||
pub fn _test_get_page_at_lsn(&self) {
|
||||
// for quick testing of the get_page_at_lsn() function.
|
||||
//
|
||||
// Get a random page from the page cache. Apply all its WAL, by requesting
|
||||
// that page at the highest lsn.
|
||||
|
||||
let mut tag: Option<BufferTag> = None;
|
||||
|
||||
{
|
||||
let shared = self.shared.lock().unwrap();
|
||||
let pagecache = &shared.pagecache;
|
||||
|
||||
if pagecache.is_empty() {
|
||||
info!("page cache is empty");
|
||||
return;
|
||||
}
|
||||
|
||||
// Find nth entry in the map, where n is picked at random
|
||||
let n = rand::thread_rng().gen_range(0..pagecache.len());
|
||||
let mut i = 0;
|
||||
for (key, _e) in pagecache.iter() {
|
||||
if i == n {
|
||||
tag = Some(key.tag);
|
||||
break;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
info!("testing GetPage@LSN for block {}", tag.unwrap().blknum);
|
||||
match self.get_page_at_lsn(tag.unwrap(), 0xffff_ffff_ffff_eeee) {
|
||||
Ok(_img) => {
|
||||
// This prints out the whole page image.
|
||||
//println!("{:X?}", img);
|
||||
}
|
||||
Err(error) => {
|
||||
error!("GetPage@LSN failed: {}", error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Shouldn't relation size also be tracked with an LSN?
|
||||
// If a replica is lagging behind, it needs to get the size as it was on
|
||||
// the replica's current replay LSN.
|
||||
pub fn relsize_inc(&self, rel: &RelTag, to: Option<u32>) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
let entry = shared.relsize_cache.entry(*rel).or_insert(0);
|
||||
|
||||
if let Some(to) = to {
|
||||
if to >= *entry {
|
||||
*entry = to + 1;
|
||||
}
|
||||
}
|
||||
trace!("relsize_inc {:?} to {}", rel, entry);
|
||||
}
|
||||
|
||||
pub fn relsize_get(&self, rel: &RelTag) -> u32 {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
let entry = shared.relsize_cache.entry(*rel).or_insert(0);
|
||||
*entry
|
||||
}
|
||||
|
||||
pub fn relsize_exist(&self, rel: &RelTag) -> bool {
|
||||
let shared = self.shared.lock().unwrap();
|
||||
let relsize_cache = &shared.relsize_cache;
|
||||
relsize_cache.contains_key(rel)
|
||||
}
|
||||
|
||||
pub fn get_stats(&self) -> PageCacheStats {
|
||||
PageCacheStats {
|
||||
num_entries: self.num_entries.load(Ordering::Relaxed),
|
||||
num_page_images: self.num_page_images.load(Ordering::Relaxed),
|
||||
num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
|
||||
num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
|
||||
first_valid_lsn: self.first_valid_lsn.load(Ordering::Relaxed),
|
||||
last_valid_lsn: self.last_valid_lsn.load(Ordering::Relaxed),
|
||||
last_record_lsn: self.last_record_lsn.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_stats() -> PageCacheStats {
|
||||
let pcaches = PAGECACHES.lock().unwrap();
|
||||
|
||||
let mut stats = PageCacheStats {
|
||||
num_entries: 0,
|
||||
num_page_images: 0,
|
||||
num_wal_records: 0,
|
||||
num_getpage_requests: 0,
|
||||
first_valid_lsn: 0,
|
||||
last_valid_lsn: 0,
|
||||
last_record_lsn: 0,
|
||||
};
|
||||
|
||||
pcaches.iter().for_each(|(_sys_id, pcache)| {
|
||||
stats += pcache.get_stats();
|
||||
});
|
||||
stats
}

pub fn get_repository() -> Arc<dyn Repository + Send + Sync> {
|
||||
let o = &REPOSITORY.lock().unwrap();
|
||||
Arc::clone(o.as_ref().unwrap())
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,11 +0,0 @@
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
//Special values for non-rel files' tags
|
||||
//TODO maybe use enum?
|
||||
pub const PG_CONTROLFILE_FORKNUM: u32 = 42;
|
||||
pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
|
||||
pub const PG_XACT_FORKNUM: u32 = 44;
|
||||
pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
|
||||
pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;
|
||||
532 pageserver/src/repository.rs Normal file
@@ -0,0 +1,532 @@
|
||||
pub mod rocksdb;
|
||||
|
||||
use crate::waldecoder::{DecodedWALRecord, Oid, TransactionId, XlCreateDatabase, XlSmgrTruncate};
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// A repository corresponds to one .zenith directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
pub trait Repository {
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
///
|
||||
/// The Timeline is expected to be already "open", i.e. `get_or_restore_timeline`
|
||||
/// should've been called on it earlier.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
///
|
||||
/// Creates a new Timeline object if it's not "open" already.
|
||||
fn get_or_restore_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Create an empty timeline, without loading any data into it from possible on-disk snapshot.
|
||||
///
|
||||
/// For unit tests.
|
||||
#[cfg(test)]
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
//fn get_stats(&self) -> RepositoryStats;
|
||||
}
|
||||
|
||||
pub trait Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(&self, tag: BufferTag, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Get size of relation
|
||||
fn get_relsize(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
/// Does relation exist?
|
||||
fn get_relsize_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
/// Get page image at the particular LSN
|
||||
fn get_page_image(&self, tag: BufferTag, lsn: Lsn) -> Result<Option<Bytes>>;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
//
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, tag: BufferTag, rec: WALRecord);
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes);
|
||||
|
||||
/// Truncate relation
|
||||
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
|
||||
|
||||
/// Create a new database from a template database
|
||||
///
|
||||
/// In PostgreSQL, CREATE DATABASE works by scanning the data directory and
|
||||
/// copying all relation files from the template database. This is the equivalent
|
||||
/// of that.
|
||||
fn put_create_database(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
db_id: Oid,
|
||||
tablespace_id: Oid,
|
||||
src_db_id: Oid,
|
||||
src_tablespace_id: Oid,
|
||||
) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Helper function to parse a WAL record and call the above functions for all the
|
||||
/// relations/pages that the record affects.
|
||||
///
|
||||
fn save_decoded_record(
|
||||
&self,
|
||||
decoded: DecodedWALRecord,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
// Figure out which blocks the record applies to, and "put" a separate copy
|
||||
// of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
self.put_wal_record(tag, rec);
|
||||
}
|
||||
|
||||
// Handle a few special record types
|
||||
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_SMGR_TRUNCATE
|
||||
{
|
||||
let truncate = XlSmgrTruncate::decode(&decoded);
|
||||
if (truncate.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
|
||||
let rel = RelTag {
|
||||
spcnode: truncate.rnode.spcnode,
|
||||
dbnode: truncate.rnode.dbnode,
|
||||
relnode: truncate.rnode.relnode,
|
||||
forknum: pg_constants::MAIN_FORKNUM,
|
||||
};
|
||||
self.put_truncation(rel, lsn, truncate.blkno)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_DBASE_CREATE
|
||||
{
|
||||
let createdb = XlCreateDatabase::decode(&decoded);
|
||||
self.put_create_database(
|
||||
lsn,
|
||||
createdb.db_id,
|
||||
createdb.tablespace_id,
|
||||
createdb.src_db_id,
|
||||
createdb.src_tablespace_id,
|
||||
)?;
|
||||
}
|
||||
// Now that this record has been handled, let the repository know that
|
||||
// it is up-to-date to this LSN
|
||||
self.advance_last_record_lsn(lsn);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that all WAL before the given LSN has been processed.
|
||||
///
|
||||
/// The WAL receiver calls this after the put_* functions, to indicate that
|
||||
/// all WAL before this point has been digested. Before that, if you call
|
||||
/// GET on an earlier LSN, it will block.
|
||||
fn advance_last_valid_lsn(&self, lsn: Lsn);
|
||||
fn get_last_valid_lsn(&self) -> Lsn;
|
||||
fn init_valid_lsn(&self, lsn: Lsn);
|
||||
|
||||
/// Like `advance_last_valid_lsn`, but this always points to the end of
|
||||
/// a WAL record, not in the middle of one.
|
||||
///
|
||||
/// This must be <= last valid LSN. This is tracked separately from last
|
||||
/// valid LSN, so that the WAL receiver knows where to restart streaming.
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
|
||||
/// Get range [begin,end) of stored blocks. Used mostly for SMGR pseudorelations
|
||||
/// but can be also applied to normal relations.
|
||||
fn get_range(&self, rel: RelTag, lsn: Lsn) -> Result<(u32, u32)>;
|
||||
|
||||
/// Get vector of databases (represented using RelTag; only the dbnode and spcnode fields are used)
|
||||
fn get_databases(&self, lsn: Lsn) -> Result<Vec<RelTag>>;
|
||||
|
||||
/// Get vector of prepared twophase transactions
|
||||
fn get_twophase(&self, lsn: Lsn) -> Result<Vec<TransactionId>>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RepositoryStats {
|
||||
pub num_entries: Lsn,
|
||||
pub num_page_images: Lsn,
|
||||
pub num_wal_records: Lsn,
|
||||
pub num_getpage_requests: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
}
|
||||
|
||||
impl RelTag {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(self.forknum);
|
||||
buf.put_u32(self.spcnode);
|
||||
buf.put_u32(self.dbnode);
|
||||
buf.put_u32(self.relnode);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> RelTag {
|
||||
RelTag {
|
||||
forknum: buf.get_u8(),
|
||||
spcnode: buf.get_u32(),
|
||||
dbnode: buf.get_u32(),
|
||||
relnode: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}_{}",
|
||||
self.spcnode, self.dbnode, self.relnode, forkname
|
||||
)
|
||||
} else {
|
||||
write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
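// For reference, this renders a main-fork relation as "1663/13008/16384" and a
// non-main fork with a suffix such as "_fsm" or "_vm". The sketch below
// reproduces the idea with a hypothetical local fork-name table standing in
// for postgres_ffi::relfile_utils::forknumber_to_name():
//
// fn forkname(forknum: u8) -> Option<&'static str> {
//     match forknum {
//         0 => None, // main fork: no suffix
//         1 => Some("fsm"),
//         2 => Some("vm"),
//         3 => Some("init"),
//         _ => None,
//     }
// }
//
// fn render(spcnode: u32, dbnode: u32, relnode: u32, forknum: u8) -> String {
//     match forkname(forknum) {
//         Some(name) => format!("{}/{}/{}_{}", spcnode, dbnode, relnode, name),
//         None => format!("{}/{}/{}", spcnode, dbnode, relnode),
//     }
// }
//
// fn main() {
//     assert_eq!(render(1663, 13008, 16384, 0), "1663/13008/16384");
//     assert_eq!(render(1663, 13008, 16384, 1), "1663/13008/16384_fsm");
// }
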
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct BufferTag {
|
||||
pub rel: RelTag,
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
impl BufferTag {
|
||||
pub fn fork(forknum: u8) -> BufferTag {
|
||||
BufferTag {
|
||||
rel: RelTag {
|
||||
forknum,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
},
|
||||
blknum: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
self.rel.pack(buf);
|
||||
buf.put_u32(self.blknum);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> BufferTag {
|
||||
BufferTag {
|
||||
rel: RelTag::unpack(buf),
|
||||
blknum: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub rec: Bytes,
|
||||
// Remember the offset of main_data in rec,
|
||||
// so that we don't have to parse the record again.
|
||||
// If record has no main_data, this offset equals rec.len().
|
||||
pub main_data_offset: u32,
|
||||
}
|
||||
|
||||
impl WALRecord {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u64(self.lsn.0);
|
||||
buf.put_u8(self.will_init as u8);
|
||||
buf.put_u32(self.main_data_offset);
|
||||
buf.put_u32(self.rec.len() as u32);
|
||||
buf.put_slice(&self.rec[..]);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> WALRecord {
|
||||
let lsn = Lsn::from(buf.get_u64());
|
||||
let will_init = buf.get_u8() != 0;
|
||||
let main_data_offset = buf.get_u32();
|
||||
let mut dst = vec![0u8; buf.get_u32() as usize];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
WALRecord {
|
||||
lsn,
|
||||
will_init,
|
||||
rec: Bytes::from(dst),
|
||||
main_data_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
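// A small round-trip check of the length-prefixed encoding above, written
// against the bytes crate with a simplified stand-in record (a sketch, not a
// test from the repository):
//
// use bytes::{Buf, BufMut, Bytes, BytesMut};
//
// #[derive(Debug, PartialEq)]
// struct Rec {
//     lsn: u64,
//     will_init: bool,
//     main_data_offset: u32,
//     rec: Bytes,
// }
//
// fn pack(r: &Rec, buf: &mut BytesMut) {
//     buf.put_u64(r.lsn);
//     buf.put_u8(r.will_init as u8);
//     buf.put_u32(r.main_data_offset);
//     buf.put_u32(r.rec.len() as u32);
//     buf.put_slice(&r.rec[..]);
// }
//
// fn unpack(buf: &mut Bytes) -> Rec {
//     let lsn = buf.get_u64();
//     let will_init = buf.get_u8() != 0;
//     let main_data_offset = buf.get_u32();
//     let len = buf.get_u32() as usize;
//     let mut body = vec![0u8; len];
//     buf.copy_to_slice(&mut body);
//     Rec { lsn, will_init, main_data_offset, rec: Bytes::from(body) }
// }
//
// fn main() {
//     let orig = Rec {
//         lsn: (2u64 << 32) | 0x15D3DD8,
//         will_init: true,
//         main_data_offset: 24,
//         rec: Bytes::from_static(b"record payload"),
//     };
//     let mut out = BytesMut::new();
//     pack(&orig, &mut out);
//     let mut input = out.freeze();
//     assert_eq!(unpack(&mut input), orig);
// }
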
///
|
||||
/// Tests that should work the same with any Repository/Timeline implementation.
|
||||
///
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelTag = RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
};
|
||||
|
||||
/// Convenience function to create a BufferTag for testing.
|
||||
/// Helps to keep the tests shorter.
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_BUF(blknum: u32) -> BufferTag {
|
||||
BufferTag {
|
||||
rel: TESTREL_A,
|
||||
blknum,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
|
||||
let conf = PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
gc_horizon: 64 * 1024 * 1024,
|
||||
gc_period: Duration::from_secs(10),
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
workdir: repo_dir.into(),
|
||||
pg_distrib_dir: "".into(),
|
||||
};
|
||||
// Make a static copy of the config. This can never be freed, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
|
||||
let repo = rocksdb::RocksRepository::new(conf, Arc::new(walredo_mgr));
|
||||
|
||||
Ok(Box::new(repo))
|
||||
}
|
||||
|
||||
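// get_test_repo() hands out a &'static PageServerConf by leaking the boxed
// config. The sketch below shows the same Box::leak trick in isolation; the
// leaked allocation is never reclaimed, which is acceptable for tests and
// process-lifetime configuration.
//
// #[derive(Debug)]
// struct Conf {
//     gc_horizon: u64,
//     daemonize: bool,
// }
//
// fn main() {
//     let conf = Conf { gc_horizon: 64 * 1024 * 1024, daemonize: false };
//     // Box::leak converts the owned Box into a &'static mut, here reborrowed
//     // as a shared &'static reference.
//     let conf: &'static Conf = Box::leak(Box::new(conf));
//     assert_eq!(conf.gc_horizon, 64 * 1024 * 1024);
//     assert!(!conf.daemonize);
// }
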
/// Test get_relsize() and truncation.
|
||||
///
|
||||
/// FIXME: The RocksRepository implementation returns wrong relation size, if
|
||||
/// you make a request with an old LSN. It seems to ignore the requested LSN
|
||||
/// and always return result as of latest LSN. For such cases, the expected
|
||||
/// results below match the current RocksRepository behavior, so that the test
|
||||
/// passes, and the actually correct answers are in comments like
|
||||
/// "// CORRECT: <correct answer>"
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"));
|
||||
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"));
|
||||
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"));
|
||||
|
||||
tline.advance_last_valid_lsn(Lsn(5));
|
||||
|
||||
// rocksdb implementation erroneously returns 'true' here
|
||||
assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(1))?, true); // CORRECT: false
|
||||
// likewise, it returns wrong size here
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(1))?, 3); // CORRECT: 0 (or error?)
|
||||
|
||||
assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(2))?, true);
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(2))?, 3); // CORRECT: 1
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 3);
|
||||
|
||||
// Check page contents at each LSN
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(2))?,
|
||||
TEST_IMG("foo blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(3))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(4))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(5))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(5))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
tline.put_truncation(TESTREL_A, Lsn(6), 2)?;
|
||||
tline.advance_last_valid_lsn(Lsn(6));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(6))?, 2);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(6))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(6))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
// should still see the truncated block with older LSN
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 2); // CORRECT: 3
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
///
|
||||
/// This isn't very interesting with the RocksDb implementation, as we don't pay
|
||||
/// any attention to Postgres segment boundaries there.
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
|
||||
let mut lsn = 0;
|
||||
for i in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
|
||||
lsn += 1;
|
||||
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img);
|
||||
}
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE + 1
|
||||
);
|
||||
|
||||
// Truncate one block
|
||||
lsn += 1;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE
|
||||
);
|
||||
|
||||
// Truncate another block
|
||||
lsn += 1;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE - 1
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager {}
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for rel {} blk {} to get to {}, with {} and {} records",
|
||||
tag.rel,
|
||||
tag.blknum,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
978
pageserver/src/repository/rocksdb.rs
Normal file
@@ -0,0 +1,978 @@
|
||||
//
|
||||
// A Repository holds all the different page versions and WAL records
|
||||
//
|
||||
// This implementation uses RocksDB to store WAL records and
|
||||
// full page images, keyed by the RelFileNode, blocknumber, and the
|
||||
// LSN.
|
||||
|
||||
use crate::repository::{BufferTag, RelTag, Repository, Timeline, WALRecord};
|
||||
use crate::restore_local_repo::restore_timeline;
|
||||
use crate::waldecoder::{Oid, TransactionId};
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
// use crate::PageServerConf;
|
||||
// use crate::branches;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use postgres_ffi::nonrelfile_utils::transaction_id_get_status;
|
||||
use postgres_ffi::*;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::thread;
|
||||
use std::time::{Duration, Instant};
|
||||
use zenith_utils::lsn::{AtomicLsn, Lsn};
|
||||
use zenith_utils::seqwait::SeqWait;
|
||||
|
||||
// Timeout when waiting for the WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
static TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
pub struct RocksRepository {
|
||||
conf: &'static PageServerConf,
|
||||
timelines: Mutex<HashMap<ZTimelineId, Arc<RocksTimeline>>>,
|
||||
|
||||
walredo_mgr: Arc<dyn WalRedoManager>,
|
||||
}
|
||||
|
||||
pub struct RocksTimeline {
|
||||
// RocksDB handle
|
||||
db: rocksdb::DB,
|
||||
|
||||
// WAL redo manager
|
||||
walredo_mgr: Arc<dyn WalRedoManager>,
|
||||
|
||||
// What page versions do we hold in the cache? If we get a request > last_valid_lsn,
|
||||
// we need to wait until we receive all the WAL up to the request. The SeqWait
|
||||
// provides functions for that. TODO: If we get a request for an old LSN, such that
|
||||
// the versions have already been garbage collected away, we should throw an error,
|
||||
// but we don't track that currently.
|
||||
//
|
||||
// last_record_lsn points to the end of last processed WAL record.
|
||||
// It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
|
||||
// after the end of last record, but not the whole next record yet. In the
|
||||
// page cache, we care about last_valid_lsn, but if the WAL receiver needs to
|
||||
// restart the streaming, it needs to restart at the end of last record, so
|
||||
// we track them separately. last_record_lsn should perhaps be in
|
||||
// walreceiver.rs instead of here, but it seems convenient to keep all three
|
||||
// values together.
|
||||
//
|
||||
last_valid_lsn: SeqWait<Lsn>,
|
||||
last_record_lsn: AtomicLsn,
|
||||
|
||||
// Counters, for metrics collection.
|
||||
pub num_entries: AtomicU64,
|
||||
pub num_page_images: AtomicU64,
|
||||
pub num_wal_records: AtomicU64,
|
||||
pub num_getpage_requests: AtomicU64,
|
||||
}
|
||||
|
||||
//
|
||||
// We store two kinds of entries in the repository:
|
||||
//
|
||||
// 1. Ready-made images of the block
|
||||
// 2. WAL records, to be applied on top of the "previous" entry
|
||||
//
|
||||
// Some WAL records will initialize the page from scratch. For such records,
|
||||
// the 'will_init' flag is set. They don't need the previous page image before
|
||||
// applying. The 'will_init' flag is set for records containing a full-page image,
|
||||
// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
|
||||
// stored directly in the cache entry in that you still need to run the WAL redo
|
||||
// routine to generate the page image.
|
||||
//
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
struct CacheKey {
|
||||
pub tag: BufferTag,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl CacheKey {
|
||||
fn pack(&self, buf: &mut BytesMut) {
|
||||
self.tag.pack(buf);
|
||||
buf.put_u64(self.lsn.0);
|
||||
}
|
||||
fn unpack(buf: &mut Bytes) -> CacheKey {
|
||||
CacheKey {
|
||||
tag: BufferTag::unpack(buf),
|
||||
lsn: Lsn::from(buf.get_u64()),
|
||||
}
|
||||
}
|
||||
|
||||
fn from_slice(slice: &[u8]) -> Self {
|
||||
let mut buf = Bytes::copy_from_slice(slice);
|
||||
Self::unpack(&mut buf)
|
||||
}
|
||||
|
||||
fn to_bytes(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::new();
|
||||
self.pack(&mut buf);
|
||||
buf
|
||||
}
|
||||
}
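// Illustrative sketch, not part of the original patch: because pack() writes the
// BufferTag first and then the LSN as a big-endian u64, RocksDB's default byte-wise
// key ordering sorts all versions of one page by LSN. The seek_for_prev() lookups
// below rely on exactly that property. The module name, test name and values here
// are made up for illustration.
#[cfg(test)]
mod cache_key_ordering_sketch {
use super::*;

#[test]
fn keys_for_one_page_sort_by_lsn() {
let key_at = |lsn| CacheKey {
tag: BufferTag {
rel: RelTag {
spcnode: 0,
dbnode: 1,
relnode: 2,
forknum: 0,
},
blknum: 42,
},
lsn,
};
// Byte-wise comparison (what RocksDB uses) agrees with LSN order.
let older = key_at(Lsn(0x10)).to_bytes();
let newer = key_at(Lsn(0x20)).to_bytes();
assert!(&older[..] < &newer[..]);
}
}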
|
||||
|
||||
enum CacheEntryContent {
|
||||
PageImage(Bytes),
|
||||
WALRecord(WALRecord),
|
||||
Truncation,
|
||||
}
|
||||
|
||||
// The serialized representation of a CacheEntryContent begins with a
// single byte that indicates what kind of entry it is. There is also
// an UNUSED_VERSION_FLAG that is not represented in the CacheEntryContent
// at all; you must peek into the first byte of the serialized representation
// to read it.
|
||||
const CONTENT_PAGE_IMAGE: u8 = 1u8;
|
||||
const CONTENT_WAL_RECORD: u8 = 2u8;
|
||||
const CONTENT_TRUNCATION: u8 = 3u8;
|
||||
|
||||
const CONTENT_KIND_MASK: u8 = 3u8; // bitmask that covers the above
|
||||
|
||||
const UNUSED_VERSION_FLAG: u8 = 4u8;
|
||||
|
||||
impl CacheEntryContent {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
match self {
|
||||
CacheEntryContent::PageImage(image) => {
|
||||
buf.put_u8(CONTENT_PAGE_IMAGE);
|
||||
buf.put_u16(image.len() as u16);
|
||||
buf.put_slice(&image[..]);
|
||||
}
|
||||
CacheEntryContent::WALRecord(rec) => {
|
||||
buf.put_u8(CONTENT_WAL_RECORD);
|
||||
rec.pack(buf);
|
||||
}
|
||||
CacheEntryContent::Truncation => {
|
||||
buf.put_u8(CONTENT_TRUNCATION);
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> CacheEntryContent {
|
||||
let kind = buf.get_u8() & CONTENT_KIND_MASK;
|
||||
|
||||
match kind {
|
||||
CONTENT_PAGE_IMAGE => {
|
||||
let len = buf.get_u16() as usize;
|
||||
let mut dst = vec![0u8; len];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
CacheEntryContent::PageImage(Bytes::from(dst))
|
||||
}
|
||||
CONTENT_WAL_RECORD => CacheEntryContent::WALRecord(WALRecord::unpack(buf)),
|
||||
CONTENT_TRUNCATION => CacheEntryContent::Truncation,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
fn from_slice(slice: &[u8]) -> Self {
|
||||
let mut buf = Bytes::copy_from_slice(slice);
|
||||
Self::unpack(&mut buf)
|
||||
}
|
||||
|
||||
fn to_bytes(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::new();
|
||||
self.pack(&mut buf);
|
||||
buf
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksRepository {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
walredo_mgr: Arc<dyn WalRedoManager>,
|
||||
) -> RocksRepository {
|
||||
RocksRepository {
|
||||
conf: conf,
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
walredo_mgr,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get handle to a given timeline. It is assumed to already exist.
|
||||
impl Repository for RocksRepository {
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
|
||||
match timelines.get(&timelineid) {
|
||||
Some(timeline) => Ok(timeline.clone()),
|
||||
None => bail!("timeline not found"),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_or_restore_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
match timelines.get(&timelineid) {
|
||||
Some(timeline) => Ok(timeline.clone()),
|
||||
None => {
|
||||
let timeline = RocksTimeline::new(self.conf, timelineid, self.walredo_mgr.clone());
|
||||
|
||||
restore_timeline(self.conf, &timeline, timelineid)?;
|
||||
|
||||
let timeline_rc = Arc::new(timeline);
|
||||
|
||||
timelines.insert(timelineid, timeline_rc.clone());
|
||||
|
||||
if self.conf.gc_horizon != 0 {
|
||||
let timeline_rc_copy = timeline_rc.clone();
|
||||
let conf = self.conf;
|
||||
let _gc_thread = thread::Builder::new()
|
||||
.name("Garbage collection thread".into())
|
||||
.spawn(move || {
|
||||
// FIXME
|
||||
timeline_rc_copy.do_gc(conf).expect("GC thread died");
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
Ok(timeline_rc)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
let timeline = RocksTimeline::new(&self.conf, timelineid, self.walredo_mgr.clone());
|
||||
|
||||
let timeline_rc = Arc::new(timeline);
|
||||
let r = timelines.insert(timelineid, timeline_rc.clone());
|
||||
assert!(r.is_none());
|
||||
|
||||
// don't start the garbage collector for unit tests, either.
|
||||
|
||||
Ok(timeline_rc)
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksTimeline {
|
||||
fn open_rocksdb(conf: &PageServerConf, timelineid: ZTimelineId) -> rocksdb::DB {
|
||||
let path = conf.timeline_path(timelineid);
|
||||
let mut opts = rocksdb::Options::default();
|
||||
opts.create_if_missing(true);
|
||||
opts.set_use_fsync(true);
|
||||
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
|
||||
opts.set_compaction_filter("ttl", move |_level: u32, _key: &[u8], val: &[u8]| {
|
||||
if (val[0] & UNUSED_VERSION_FLAG) != 0 {
|
||||
rocksdb::compaction_filter::Decision::Remove
|
||||
} else {
|
||||
rocksdb::compaction_filter::Decision::Keep
|
||||
}
|
||||
});
|
||||
rocksdb::DB::open(&opts, &path).unwrap()
|
||||
}
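// Note added for clarity (not in the original patch): deletion in this design is
// two-step. The GC pass below only sets UNUSED_VERSION_FLAG in the first byte of a
// stored value; the "ttl" compaction filter registered above then physically drops
// such versions the next time RocksDB compacts that key range.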
|
||||
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager>,
|
||||
) -> RocksTimeline {
|
||||
RocksTimeline {
|
||||
db: RocksTimeline::open_rocksdb(conf, timelineid),
|
||||
|
||||
walredo_mgr,
|
||||
|
||||
last_valid_lsn: SeqWait::new(Lsn(0)),
|
||||
last_record_lsn: AtomicLsn::new(0),
|
||||
|
||||
num_entries: AtomicU64::new(0),
|
||||
num_page_images: AtomicU64::new(0),
|
||||
num_wal_records: AtomicU64::new(0),
|
||||
num_getpage_requests: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksTimeline {
|
||||
///
|
||||
/// Collect all the WAL records that are needed to reconstruct a page
|
||||
/// image for the given cache entry.
|
||||
///
|
||||
/// Returns an old page image (if any), and a vector of WAL records to apply
|
||||
/// over it.
|
||||
///
|
||||
fn collect_records_for_apply(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
) -> (Option<Bytes>, Vec<WALRecord>) {
|
||||
let key = CacheKey { tag, lsn };
|
||||
let mut base_img: Option<Bytes> = None;
|
||||
let mut records: Vec<WALRecord> = Vec::new();
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(key.to_bytes());
|
||||
|
||||
// Scan backwards, collecting the WAL records, until we hit an
|
||||
// old page image.
|
||||
while iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag != tag {
|
||||
break;
|
||||
}
|
||||
let content = CacheEntryContent::from_slice(iter.value().unwrap());
|
||||
if let CacheEntryContent::PageImage(img) = content {
|
||||
// We have a base image. No need to dig deeper into the list of
|
||||
// records
|
||||
base_img = Some(img);
|
||||
break;
|
||||
} else if let CacheEntryContent::WALRecord(rec) = content {
|
||||
records.push(rec.clone());
|
||||
// If this WAL record initializes the page, no need to dig deeper.
|
||||
if rec.will_init {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
panic!("no base image and no WAL record on cache entry");
|
||||
}
|
||||
iter.prev();
|
||||
}
|
||||
records.reverse();
|
||||
(base_img, records)
|
||||
}
|
||||
|
||||
// Internal functions
|
||||
|
||||
//
|
||||
// Internal function to get relation size at given LSN.
|
||||
//
|
||||
// The caller must ensure that WAL has been received up to 'lsn'.
|
||||
//
|
||||
fn relsize_get_nowait(&self, rel: RelTag, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn <= self.last_valid_lsn.load());
|
||||
|
||||
let mut key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel,
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
loop {
|
||||
iter.seek_for_prev(key.to_bytes());
|
||||
if iter.valid() {
|
||||
let thiskey = CacheKey::from_slice(iter.key().unwrap());
|
||||
if thiskey.tag.rel == rel {
|
||||
let content = CacheEntryContent::from_slice(iter.value().unwrap());
|
||||
if let CacheEntryContent::Truncation = content {
|
||||
if thiskey.tag.blknum > 0 {
|
||||
key.tag.blknum = thiskey.tag.blknum - 1;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
let relsize = thiskey.tag.blknum + 1;
|
||||
debug!("Size of relation {} at {} is {}", rel, lsn, relsize);
|
||||
return Ok(relsize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
debug!("Size of relation {} at {} is zero", rel, lsn);
|
||||
Ok(0)
|
||||
}
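// Worked example of the lookup above (hypothetical numbers, not from the original
// patch): suppose a relation once had blocks 0..=8 and was later truncated to 4
// blocks, so blocks 4..=8 carry Truncation entries. When queried at an LSN after
// the truncation, the seek from (rel, u32::MAX, lsn) first lands on block 8's
// Truncation entry; the loop then retries from block 7, 6, ... until block 3,
// which still has a page entry, and returns a size of 3 + 1 = 4.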
|
||||
|
||||
fn do_gc(&self, conf: &'static PageServerConf) -> Result<Bytes> {
|
||||
loop {
|
||||
thread::sleep(conf.gc_period);
|
||||
let last_lsn = self.get_last_valid_lsn();
|
||||
|
||||
// checked_sub() returns None on overflow.
|
||||
if let Some(horizon) = last_lsn.checked_sub(conf.gc_horizon) {
|
||||
let mut maxkey = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: u32::MAX,
|
||||
dbnode: u32::MAX,
|
||||
relnode: u32::MAX,
|
||||
forknum: u8::MAX,
|
||||
},
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn: Lsn::MAX,
|
||||
};
|
||||
let now = Instant::now();
|
||||
let mut reconstructed = 0u64;
|
||||
let mut truncated = 0u64;
|
||||
let mut inspected = 0u64;
|
||||
let mut deleted = 0u64;
|
||||
loop {
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(maxkey.to_bytes());
|
||||
if iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
let v = iter.value().unwrap();
|
||||
|
||||
inspected += 1;
|
||||
|
||||
// Construct boundaries for old records cleanup
|
||||
maxkey.tag = key.tag;
|
||||
let last_lsn = key.lsn;
|
||||
maxkey.lsn = min(horizon, last_lsn); // do not remove last version
|
||||
|
||||
let mut minkey = maxkey.clone();
|
||||
minkey.lsn = Lsn(0); // first version
|
||||
|
||||
// Special handling of delete of PREPARE WAL record
|
||||
if last_lsn < horizon
|
||||
&& key.tag.rel.forknum == pg_constants::PG_TWOPHASE_FORKNUM
|
||||
{
|
||||
if (v[0] & UNUSED_VERSION_FLAG) == 0 {
|
||||
let mut v = v.to_owned();
|
||||
v[0] |= UNUSED_VERSION_FLAG;
|
||||
self.db.put(key.to_bytes(), &v[..])?;
|
||||
deleted += 1;
|
||||
}
|
||||
maxkey = minkey;
|
||||
continue;
|
||||
}
|
||||
// reconstruct most recent page version
|
||||
if (v[0] & CONTENT_KIND_MASK) == CONTENT_WAL_RECORD {
|
||||
// force reconstruction of most recent page version
|
||||
let (base_img, records) =
|
||||
self.collect_records_for_apply(key.tag, key.lsn);
|
||||
|
||||
trace!(
|
||||
"Reconstruct most recent page {} blk {} at {} from {} records",
|
||||
key.tag.rel,
|
||||
key.tag.blknum,
|
||||
key.lsn,
|
||||
records.len()
|
||||
);
|
||||
|
||||
let new_img = self
|
||||
.walredo_mgr
|
||||
.request_redo(key.tag, key.lsn, base_img, records)?;
|
||||
self.put_page_image(key.tag, key.lsn, new_img.clone());
|
||||
|
||||
reconstructed += 1;
|
||||
}
|
||||
|
||||
iter.seek_for_prev(maxkey.to_bytes());
|
||||
if iter.valid() {
|
||||
// do not remove last version
|
||||
if last_lsn > horizon {
|
||||
// locate most recent record before horizon
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag == maxkey.tag {
|
||||
let v = iter.value().unwrap();
|
||||
if (v[0] & CONTENT_KIND_MASK) == CONTENT_WAL_RECORD {
|
||||
let (base_img, records) =
|
||||
self.collect_records_for_apply(key.tag, key.lsn);
|
||||
trace!("Reconstruct horizon page {} blk {} at {} from {} records",
|
||||
key.tag.rel, key.tag.blknum, key.lsn, records.len());
|
||||
let new_img = self
|
||||
.walredo_mgr
|
||||
.request_redo(key.tag, key.lsn, base_img, records)?;
|
||||
self.put_page_image(key.tag, key.lsn, new_img.clone());
|
||||
|
||||
truncated += 1;
|
||||
} else {
|
||||
trace!(
|
||||
"Keeping horizon page {} blk {} at {}",
|
||||
key.tag.rel,
|
||||
key.tag.blknum,
|
||||
key.lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
trace!(
|
||||
"Last page {} blk {} at {}, horizon {}",
|
||||
key.tag.rel,
|
||||
key.tag.blknum,
|
||||
key.lsn,
|
||||
horizon
|
||||
);
|
||||
}
|
||||
// remove records prior to horizon
|
||||
loop {
|
||||
iter.prev();
|
||||
if !iter.valid() {
|
||||
break;
|
||||
}
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag != maxkey.tag {
|
||||
break;
|
||||
}
|
||||
let v = iter.value().unwrap();
|
||||
if (v[0] & UNUSED_VERSION_FLAG) == 0 {
|
||||
let mut v = v.to_owned();
|
||||
v[0] |= UNUSED_VERSION_FLAG;
|
||||
self.db.put(key.to_bytes(), &v[..])?;
|
||||
deleted += 1;
|
||||
trace!(
|
||||
"deleted: {} blk {} at {}",
|
||||
key.tag.rel,
|
||||
key.tag.blknum,
|
||||
key.lsn
|
||||
);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
maxkey = minkey;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
info!("Garbage collection completed in {:?}:\n{} version chains inspected, {} pages reconstructed, {} version histories truncated, {} versions deleted",
|
||||
now.elapsed(), inspected, reconstructed, truncated, deleted);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Wait until WAL has been received up to the given LSN.
|
||||
//
|
||||
fn wait_lsn(&self, mut lsn: Lsn) -> Result<Lsn> {
|
||||
// When invalid LSN is requested, it means "don't wait, return latest version of the page"
|
||||
// This is necessary for bootstrap.
|
||||
if lsn == Lsn(0) {
|
||||
let last_valid_lsn = self.last_valid_lsn.load();
|
||||
trace!(
|
||||
"walreceiver doesn't work yet last_valid_lsn {}, requested {}",
|
||||
last_valid_lsn,
|
||||
lsn
|
||||
);
|
||||
lsn = last_valid_lsn;
|
||||
}
|
||||
//trace!("Start waiting for LSN {}, valid LSN is {}", lsn, self.last_valid_lsn.load());
|
||||
self.last_valid_lsn
|
||||
.wait_for_timeout(lsn, TIMEOUT)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive",
|
||||
lsn
|
||||
)
|
||||
})?;
|
||||
//trace!("Stop waiting for LSN {}, valid LSN is {}", lsn, self.last_valid_lsn.load());
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline for RocksTimeline {
|
||||
// Public GET interface functions
|
||||
|
||||
///
|
||||
/// GetPage@LSN
|
||||
///
|
||||
/// Returns an 8k page image
|
||||
///
|
||||
fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: Lsn) -> Result<Bytes> {
|
||||
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let lsn = self.wait_lsn(req_lsn)?;
|
||||
|
||||
// Look up cache entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let key = CacheKey { tag, lsn };
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(key.to_bytes());
|
||||
|
||||
if iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag == tag {
|
||||
let content = CacheEntryContent::from_slice(iter.value().unwrap());
|
||||
let page_img: Bytes;
|
||||
if let CacheEntryContent::PageImage(img) = content {
|
||||
page_img = img;
|
||||
} else if let CacheEntryContent::WALRecord(_rec) = content {
|
||||
// Request the WAL redo manager to apply the WAL records for us.
|
||||
let (base_img, records) = self.collect_records_for_apply(tag, lsn);
|
||||
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
|
||||
|
||||
self.put_page_image(tag, lsn, page_img.clone());
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
// FIXME: assumes little-endian. Only used for the debugging log though
|
||||
let page_lsn_hi =
|
||||
u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
|
||||
let page_lsn_lo =
|
||||
u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
|
||||
debug!(
|
||||
"Returning page with LSN {:X}/{:X} for {} blk {}",
|
||||
page_lsn_hi, page_lsn_lo, tag.rel, tag.blknum
|
||||
);
|
||||
return Ok(page_img);
|
||||
}
|
||||
}
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
debug!(
|
||||
"Page {} blk {} at {}({}) not found",
|
||||
tag.rel, tag.blknum, req_lsn, lsn
|
||||
);
|
||||
Ok(Bytes::from_static(&ZERO_PAGE))
|
||||
/* return Err("could not find page image")?; */
|
||||
}
|
||||
|
||||
///
|
||||
/// Get size of relation at given LSN.
|
||||
///
|
||||
fn get_relsize(&self, rel: RelTag, lsn: Lsn) -> Result<u32> {
|
||||
let lsn = self.wait_lsn(lsn)?;
|
||||
self.relsize_get_nowait(rel, lsn)
|
||||
}
|
||||
|
||||
/// Get vector of prepared twophase transactions
|
||||
fn get_twophase(&self, lsn: Lsn) -> Result<Vec<TransactionId>> {
|
||||
let key = CacheKey {
|
||||
// minimal key
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
forknum: pg_constants::PG_TWOPHASE_FORKNUM,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
},
|
||||
blknum: 0,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let mut gxacts = Vec::new();
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(key.to_bytes());
|
||||
while iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag.rel.forknum != pg_constants::PG_TWOPHASE_FORKNUM {
|
||||
break; // we are done with this fork
|
||||
}
|
||||
if key.lsn <= lsn {
|
||||
let xid = key.tag.blknum;
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
forknum: pg_constants::PG_XACT_FORKNUM,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
},
|
||||
blknum: xid / pg_constants::CLOG_XACTS_PER_PAGE,
|
||||
};
|
||||
let clog_page = self.get_page_at_lsn(tag, lsn)?;
|
||||
let status = transaction_id_get_status(xid, &clog_page[..]);
|
||||
if status == pg_constants::TRANSACTION_STATUS_IN_PROGRESS {
|
||||
gxacts.push(xid);
|
||||
}
|
||||
}
|
||||
iter.next();
|
||||
}
|
||||
return Ok(gxacts);
|
||||
}
|
||||
|
||||
/// Get databases. This function is used to locate pg_filenode.map files
|
||||
fn get_databases(&self, lsn: Lsn) -> Result<Vec<RelTag>> {
|
||||
let key = CacheKey {
|
||||
// minimal key
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
},
|
||||
blknum: 0,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let mut dbs = Vec::new();
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(key.to_bytes());
|
||||
let mut prev_tag = key.tag.rel;
|
||||
while iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag.rel.forknum != pg_constants::PG_FILENODEMAP_FORKNUM {
|
||||
break; // we are done with this fork
|
||||
}
|
||||
if key.tag.rel != prev_tag && key.lsn <= lsn {
|
||||
prev_tag = key.tag.rel;
|
||||
dbs.push(prev_tag); // collect unique tags
|
||||
}
|
||||
iter.next();
|
||||
}
|
||||
return Ok(dbs);
|
||||
}
|
||||
|
||||
/// Get range [begin,end) of stored blocks. Used mostly for SMGR pseudorelations
|
||||
/// but can also be applied to normal relations.
|
||||
fn get_range(&self, rel: RelTag, lsn: Lsn) -> Result<(u32, u32)> {
|
||||
let _lsn = self.wait_lsn(lsn)?;
|
||||
let mut key = CacheKey {
|
||||
// minimal key to start with
|
||||
tag: BufferTag { rel, blknum: 0 },
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(key.to_bytes()); // locate first entry
|
||||
if iter.valid() {
|
||||
let thiskey = CacheKey::from_slice(iter.key().unwrap());
|
||||
let tag = thiskey.tag;
|
||||
if tag.rel == rel {
|
||||
// still traversing this relation
|
||||
let first_blknum = tag.blknum;
|
||||
key.tag.blknum = u32::MAX; // maximal key
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(key.to_bytes()); // locate last entry
|
||||
if iter.valid() {
|
||||
let thiskey = CacheKey::from_slice(iter.key().unwrap());
|
||||
let last_blknum = thiskey.tag.blknum;
|
||||
return Ok((first_blknum, last_blknum + 1)); // upper boundary is exclusive
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok((0, 0)) // empty range
|
||||
}
|
||||
|
||||
///
|
||||
/// Does relation exist at given LSN?
|
||||
///
|
||||
/// FIXME: this actually returns true if the relation exists at *any* LSN
|
||||
fn get_relsize_exists(&self, rel: RelTag, req_lsn: Lsn) -> Result<bool> {
|
||||
let lsn = self.wait_lsn(req_lsn)?;
|
||||
|
||||
let key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel,
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(key.to_bytes());
|
||||
if iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag.rel == rel {
|
||||
debug!("Relation {} exists at {}", rel, lsn);
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
debug!("Relation {} doesn't exist at {}", rel, lsn);
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
// Other public functions, for updating the repository.
|
||||
// These are used by the WAL receiver and WAL redo.
|
||||
|
||||
///
|
||||
/// Adds a WAL record to the repository
|
||||
///
|
||||
fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
|
||||
let lsn = rec.lsn;
|
||||
let key = CacheKey { tag, lsn };
|
||||
|
||||
let content = CacheEntryContent::WALRecord(rec);
|
||||
|
||||
let _res = self.db.put(key.to_bytes(), content.to_bytes());
|
||||
trace!(
|
||||
"put_wal_record rel {} blk {} at {}",
|
||||
tag.rel,
|
||||
tag.blknum,
|
||||
lsn
|
||||
);
|
||||
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
self.num_wal_records.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
///
|
||||
/// Adds a relation-wide WAL record (like truncate) to the repository,
|
||||
/// associating it with all pages starting at the specified block number
|
||||
///
|
||||
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()> {
|
||||
// What was the size of the relation before this record?
|
||||
let last_lsn = self.last_valid_lsn.load();
|
||||
let old_rel_size = self.relsize_get_nowait(rel, last_lsn)?;
|
||||
|
||||
let content = CacheEntryContent::Truncation;
|
||||
// set new relation size
|
||||
trace!("Truncate relation {} to {} blocks at {}", rel, nblocks, lsn);
|
||||
|
||||
for blknum in nblocks..old_rel_size {
|
||||
let key = CacheKey {
|
||||
tag: BufferTag { rel, blknum },
|
||||
lsn,
|
||||
};
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
let _res = self.db.put(key.to_bytes(), content.to_bytes());
|
||||
}
|
||||
let n = (old_rel_size - nblocks) as u64;
|
||||
self.num_entries.fetch_add(n, Ordering::Relaxed);
|
||||
self.num_wal_records.fetch_add(n, Ordering::Relaxed);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Get page image at particular LSN
|
||||
///
|
||||
fn get_page_image(&self, tag: BufferTag, lsn: Lsn) -> Result<Option<Bytes>> {
|
||||
let key = CacheKey { tag, lsn };
|
||||
if let Some(bytes) = self.db.get(key.to_bytes())? {
|
||||
let content = CacheEntryContent::from_slice(&bytes);
|
||||
if let CacheEntryContent::PageImage(img) = content {
|
||||
return Ok(Some(img));
|
||||
}
|
||||
}
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
///
|
||||
/// Memorize a full image of a page version
|
||||
///
|
||||
fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes) {
|
||||
let img_len = img.len();
|
||||
let key = CacheKey { tag, lsn };
|
||||
let content = CacheEntryContent::PageImage(img);
|
||||
|
||||
let mut val_buf = content.to_bytes();
|
||||
|
||||
// A zero-size page image indicates that the page can be removed
|
||||
if img_len == 0 {
|
||||
if (val_buf[0] & UNUSED_VERSION_FLAG) != 0 {
|
||||
// records already marked for deletion
|
||||
return;
|
||||
} else {
|
||||
// delete truncated multixact page
|
||||
val_buf[0] |= UNUSED_VERSION_FLAG;
|
||||
}
|
||||
}
|
||||
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
let _res = self.db.put(key.to_bytes(), content.to_bytes());
|
||||
|
||||
trace!(
|
||||
"put_page_image rel {} blk {} at {}",
|
||||
tag.rel,
|
||||
tag.blknum,
|
||||
lsn
|
||||
);
|
||||
self.num_page_images.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn put_create_database(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
db_id: Oid,
|
||||
tablespace_id: Oid,
|
||||
src_db_id: Oid,
|
||||
src_tablespace_id: Oid,
|
||||
) -> Result<()> {
|
||||
let mut n = 0;
|
||||
for forknum in &[
|
||||
pg_constants::MAIN_FORKNUM,
|
||||
pg_constants::FSM_FORKNUM,
|
||||
pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
pg_constants::INIT_FORKNUM,
|
||||
pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
] {
|
||||
let key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: src_tablespace_id,
|
||||
dbnode: src_db_id,
|
||||
relnode: 0,
|
||||
forknum: *forknum,
|
||||
},
|
||||
blknum: 0,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(key.to_bytes());
|
||||
while iter.valid() {
|
||||
let mut key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag.rel.spcnode != src_tablespace_id || key.tag.rel.dbnode != src_db_id {
|
||||
break;
|
||||
}
|
||||
key.tag.rel.spcnode = tablespace_id;
|
||||
key.tag.rel.dbnode = db_id;
|
||||
key.lsn = lsn;
|
||||
|
||||
let v = iter.value().unwrap();
|
||||
self.db.put(key.to_bytes(), v)?;
|
||||
n += 1;
|
||||
iter.next();
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Create database {}/{}, copy {} entries",
|
||||
tablespace_id, db_id, n
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that WAL has been received and added to the timeline up to the given LSN
|
||||
fn advance_last_valid_lsn(&self, lsn: Lsn) {
|
||||
let lsn = Lsn((lsn.0 + 7) & !7); // align position on 8 bytes
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
|
||||
// Can't move backwards.
|
||||
if lsn < old {
|
||||
warn!(
|
||||
"attempted to move last valid LSN backwards (was {}, new {})",
|
||||
old, lsn
|
||||
);
|
||||
}
|
||||
}
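// Worked example of the rounding above (editorial note, not from the original
// patch): (lsn.0 + 7) & !7 rounds an LSN up to the next multiple of 8, so 0x2A
// becomes 0x30 while 0x30 stays 0x30. Both advance functions apply it, presumably
// because Postgres pads WAL records to MAXALIGN (8-byte) boundaries, so a record
// boundary is always 8-byte aligned.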
|
||||
|
||||
///
|
||||
/// Remember the (end of) last valid WAL record remembered for the timeline.
|
||||
///
|
||||
/// NOTE: this updates last_valid_lsn as well.
|
||||
///
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn) {
|
||||
let lsn = Lsn((lsn.0 + 7) & !7); // align position on 8 bytes
|
||||
// Can't move backwards.
|
||||
let old = self.last_record_lsn.fetch_max(lsn);
|
||||
assert!(old <= lsn);
|
||||
|
||||
// Also advance last_valid_lsn
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
// Can't move backwards.
|
||||
if lsn < old {
|
||||
warn!(
|
||||
"attempted to move last record LSN backwards (was {}, new {})",
|
||||
old, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load()
|
||||
}
|
||||
|
||||
fn init_valid_lsn(&self, lsn: Lsn) {
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
assert!(old == Lsn(0));
|
||||
let old = self.last_record_lsn.fetch_max(lsn);
|
||||
assert!(old == Lsn(0));
|
||||
}
|
||||
|
||||
fn get_last_valid_lsn(&self) -> Lsn {
|
||||
self.last_valid_lsn.load()
|
||||
}
|
||||
|
||||
//
|
||||
// Get statistics to be displayed in the user interface.
|
||||
//
|
||||
// FIXME
|
||||
/*
|
||||
fn get_stats(&self) -> TimelineStats {
|
||||
TimelineStats {
|
||||
num_entries: self.num_entries.load(Ordering::Relaxed),
|
||||
num_page_images: self.num_page_images.load(Ordering::Relaxed),
|
||||
num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
|
||||
num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
483
pageserver/src/restore_local_repo.rs
Normal file
@@ -0,0 +1,483 @@
|
||||
//
|
||||
// Restore chunks from local Zenith repository
|
||||
//
|
||||
// This runs once at Page Server startup. It loads all the "snapshots" and all
|
||||
// WAL from all timelines from the local zenith repository into the in-memory page
|
||||
// cache.
|
||||
//
|
||||
// This also initializes the "last valid LSN" in the page cache to the last LSN
|
||||
// seen in the WAL, so that when the WAL receiver is started, it starts
|
||||
// streaming from that LSN.
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use std::cmp::max;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::io::Seek;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
|
||||
use crate::repository::{BufferTag, RelTag, Timeline};
|
||||
use crate::waldecoder::{decode_wal_record, Oid, WalStreamDecoder};
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Load all WAL and all relation data pages from local disk into the repository.
|
||||
///
|
||||
pub fn restore_timeline(
|
||||
conf: &PageServerConf,
|
||||
timeline: &dyn Timeline,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<()> {
|
||||
let timelinepath = PathBuf::from("timelines").join(timelineid.to_string());
|
||||
|
||||
if !timelinepath.exists() {
|
||||
anyhow::bail!("timeline {} does not exist in the page server's repository", timelineid);
|
||||
}
|
||||
|
||||
// Scan .zenith/timelines/<timeline>/snapshots
|
||||
let snapshotspath = PathBuf::from("timelines")
|
||||
.join(timelineid.to_string())
|
||||
.join("snapshots");
|
||||
|
||||
let mut last_snapshot_lsn: Lsn = Lsn(0);
|
||||
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let direntry = direntry?;
|
||||
let filename = direntry.file_name();
|
||||
let lsn = Lsn::from_filename(&filename)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
|
||||
// FIXME: pass filename as Path instead of str?
|
||||
let filename_str = filename.into_string().unwrap();
|
||||
restore_snapshot(conf, timeline, timelineid, &filename_str)?;
|
||||
info!("restored snapshot at {:?}", filename_str);
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == Lsn(0) {
|
||||
error!(
|
||||
"could not find valid snapshot in {}",
|
||||
snapshotspath.display()
|
||||
);
|
||||
// TODO return error?
|
||||
}
|
||||
timeline.init_valid_lsn(last_snapshot_lsn);
|
||||
|
||||
restore_wal(timeline, timelineid, last_snapshot_lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Find latest snapshot in a timeline's 'snapshots' directory
|
||||
///
|
||||
pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
|
||||
let snapshotspath = format!("timelines/{}/snapshots", timeline);
|
||||
|
||||
let mut last_snapshot_lsn = Lsn(0);
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let filename = direntry.unwrap().file_name();
|
||||
|
||||
if let Ok(lsn) = Lsn::from_filename(&filename) {
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
} else {
|
||||
error!("unrecognized file in snapshots directory: {:?}", filename);
|
||||
}
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == Lsn(0) {
|
||||
error!("could not find valid snapshot in {}", &snapshotspath);
|
||||
// TODO return error?
|
||||
}
|
||||
Ok(last_snapshot_lsn)
|
||||
}
|
||||
|
||||
fn restore_snapshot(
|
||||
conf: &PageServerConf,
|
||||
timeline: &dyn Timeline,
|
||||
timelineid: ZTimelineId,
|
||||
snapshot: &str,
|
||||
) -> Result<()> {
|
||||
let snapshotpath = PathBuf::from("timelines")
|
||||
.join(timelineid.to_string())
|
||||
.join("snapshots")
|
||||
.join(snapshot);
|
||||
|
||||
// Scan 'global'
|
||||
for direntry in fs::read_dir(snapshotpath.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("pg_control") => restore_nonrel_file(
|
||||
conf,
|
||||
timeline,
|
||||
timelineid,
|
||||
"0",
|
||||
0,
|
||||
0,
|
||||
pg_constants::PG_CONTROLFILE_FORKNUM,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
Some("pg_filenode.map") => restore_nonrel_file(
|
||||
conf,
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
timeline,
|
||||
snapshot,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
}
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains one directory per database; the directory name is the database OID.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
for direntry in fs::read_dir(snapshotpath.join("base"))? {
|
||||
let direntry = direntry?;
|
||||
|
||||
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
|
||||
|
||||
for direntry in fs::read_dir(direntry.path())? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => restore_nonrel_file(
|
||||
conf,
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
timeline,
|
||||
snapshot,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
restore_slru_file(
|
||||
conf,
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::PG_XACT_FORKNUM,
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
restore_slru_file(
|
||||
conf,
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::PG_MXACT_MEMBERS_FORKNUM,
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
restore_slru_file(
|
||||
conf,
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::PG_MXACT_OFFSETS_FORKNUM,
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(&entry.path().to_str().unwrap(), 16)?;
|
||||
restore_nonrel_file(
|
||||
conf,
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
0,
|
||||
0,
|
||||
pg_constants::PG_TWOPHASE_FORKNUM,
|
||||
xid,
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_relfile(
|
||||
timeline: &dyn Timeline,
|
||||
snapshot: &str,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
if let Err(e) = p {
|
||||
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
let (relnode, forknum, segno) = p.unwrap();
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
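// (Editorial note, not from the original patch: with the 8192-byte pages used
// throughout this file, this works out to 131072 blocks per 1 GB segment, so
// segment N starts numbering its pages at N * 131072.)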
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_nonrel_file(
|
||||
_conf: &PageServerConf,
|
||||
timeline: &dyn Timeline,
|
||||
_timelineid: ZTimelineId,
|
||||
snapshot: &str,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
forknum: u8,
|
||||
blknum: u32,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode: 0,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_slru_file(
|
||||
_conf: &PageServerConf,
|
||||
timeline: &dyn Timeline,
|
||||
_timelineid: ZTimelineId,
|
||||
snapshot: &str,
|
||||
forknum: u8,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Scan WAL on a timeline, starting from given LSN, and load all the records
|
||||
// into the page cache.
|
||||
fn restore_wal(timeline: &dyn Timeline, timelineid: ZTimelineId, startpoint: Lsn) -> Result<()> {
|
||||
let walpath = format!("timelines/{}/wal", timelineid);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = Lsn(0);
|
||||
|
||||
let mut checkpoint = CheckPoint::new(startpoint.0, 1);
|
||||
let checkpoint_tag = BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM);
|
||||
let pg_control_tag = BufferTag::fork(pg_constants::PG_CONTROLFILE_FORKNUM);
|
||||
if let Some(pg_control_bytes) = timeline.get_page_image(pg_control_tag, Lsn(0))? {
|
||||
let pg_control = decode_pg_control(pg_control_bytes)?;
|
||||
checkpoint = pg_control.checkPointCopy.clone();
|
||||
} else {
|
||||
error!("No control file is found in reposistory");
|
||||
}
|
||||
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut path = walpath.clone() + "/" + &filename;
|
||||
|
||||
// It might exist only as a .partial file
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path += ".partial";
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let open_result = File::open(&path);
|
||||
if let Err(e) = &open_result {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let mut file = open_result?;
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
}
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
|
||||
// Maybe allow this for .partial files?
|
||||
error!("read only {} bytes from WAL file", nread);
|
||||
}
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
loop {
|
||||
let rec = waldecoder.poll_decode();
|
||||
if rec.is_err() {
|
||||
// Assume that an error means we've reached the end of
|
||||
// a partial WAL record. So that's ok.
|
||||
trace!("WAL decoder error {:?}", rec);
|
||||
waldecoder.set_position(Lsn((segno + 1) * pg_constants::WAL_SEGMENT_SIZE as u64));
|
||||
break;
|
||||
}
|
||||
if let Some((lsn, recdata)) = rec.unwrap() {
|
||||
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
|
||||
timeline.save_decoded_record(decoded, recdata, lsn)?;
|
||||
last_lsn = lsn;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
nrecords += 1;
|
||||
}
|
||||
|
||||
info!(
|
||||
"restored {} records from WAL file {} at {}",
|
||||
nrecords, filename, last_lsn
|
||||
);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
let checkpoint_bytes = encode_checkpoint(checkpoint);
|
||||
timeline.put_page_image(checkpoint_tag, Lsn(0), checkpoint_bytes);
|
||||
Ok(())
|
||||
}
|
||||
@@ -22,7 +22,9 @@ use tokio::runtime;
|
||||
|
||||
use futures::future;
|
||||
|
||||
use crate::{controlfile, page_cache, pg_constants, PageServerConf};
|
||||
use crate::{page_cache, PageServerConf};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
|
||||
struct Storage {
|
||||
region: Region,
|
||||
@@ -38,12 +40,9 @@ pub fn restore_main(conf: &PageServerConf) {
|
||||
let result = restore_chunk(conf).await;
|
||||
|
||||
match result {
|
||||
Ok(_) => {
|
||||
return;
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(err) => {
|
||||
error!("S3 error: {}", err);
|
||||
return;
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -60,8 +59,8 @@ pub fn restore_main(conf: &PageServerConf) {
|
||||
async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
let backend = Storage {
|
||||
region: Region::Custom {
|
||||
region: env::var("S3_REGION").unwrap().into(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap().into(),
|
||||
region: env::var("S3_REGION").unwrap(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap(),
|
||||
},
|
||||
credentials: Credentials::new(
|
||||
Some(&env::var("S3_ACCESSKEY").unwrap()),
|
||||
@@ -84,24 +83,8 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
.list("relationdata/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
|
||||
//Before uploading other files, slurp pg_control to set systemid
|
||||
|
||||
let control_results: Vec<s3::serde_types::ListBucketResult> = bucket
|
||||
.list(
|
||||
"relationdata/global/pg_control".to_string(),
|
||||
Some("".to_string()),
|
||||
)
|
||||
.await?;
|
||||
let object = &(&control_results[0]).contents[0];
|
||||
let (data, _) = bucket.get_object(&object.key).await.unwrap();
|
||||
let bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
let c = controlfile::decode_pg_control(bytes);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), c.system_identifier);
|
||||
pcache.set_controldata(c.clone());
|
||||
trace!("uploaded controlfile {:?}", pcache.get_controldata());
|
||||
|
||||
let sys_id: u64 = c.system_identifier;
|
||||
// TODO: get that from backup
|
||||
let sys_id: u64 = 42;
|
||||
let mut oldest_lsn = 0;
|
||||
let mut slurp_futures: Vec<_> = Vec::new();
|
||||
|
||||
@@ -135,114 +118,35 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
panic!("no base backup found");
|
||||
}
|
||||
|
||||
//Now add nonrelation files
|
||||
let nonrelresults: Vec<s3::serde_types::ListBucketResult> = bucket
|
||||
.list("nonreldata/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
for result in nonrelresults {
|
||||
for object in result.contents {
|
||||
// Download needed non relation files, slurping them into memory
|
||||
|
||||
let key = object.key;
|
||||
let relpath = key.strip_prefix("nonreldata/").unwrap();
|
||||
trace!("list nonrelfiles {}", relpath);
|
||||
|
||||
let parsed = parse_nonrel_file_path(&relpath);
|
||||
|
||||
match parsed {
|
||||
Ok(p) => {
|
||||
let b = bucket.clone();
|
||||
let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);
|
||||
|
||||
slurp_futures.push(f);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("unrecognized file: {} ({})", relpath, e);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
pcache.init_valid_lsn(oldest_lsn);
|
||||
|
||||
info!("{} files to restore...", slurp_futures.len());
|
||||
|
||||
future::join_all(slurp_futures).await;
|
||||
info!(
|
||||
"restored! {:?} to {:?}",
|
||||
pcache.first_valid_lsn, pcache.last_valid_lsn
|
||||
);
|
||||
info!("restored!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ParsedBaseImageFileName {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u32,
|
||||
pub forknum: u8,
|
||||
pub segno: u32,
|
||||
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
fn parse_lsn_from_filename(fname: &str) -> Result<u64, FilePathError> {
|
||||
let (_, lsn_str) = fname.split_at(fname.len() - 16);
|
||||
|
||||
let (lsnhi, lsnlo) = lsn_str.split_at(8);
|
||||
let lsn_hi = u64::from_str_radix(lsnhi, 16)?;
|
||||
let lsn_lo = u64::from_str_radix(lsnlo, 16)?;
|
||||
let lsn = lsn_hi << 32 | lsn_lo;
|
||||
|
||||
return Ok(lsn);
|
||||
}
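// Hypothetical example of the parsing above (not from the original patch): for a
// filename ending in "0000000A0000BEEF", the last 16 hex digits split into
// hi = 0x0000000A and lo = 0x0000BEEF, giving lsn = 0xA0000BEEF, i.e. the
// Postgres LSN A/0000BEEF.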
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();
|
||||
|
||||
let caps = re
|
||||
@@ -250,68 +154,23 @@ fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
let relnode: u32 = relnode_str.parse()?;
|
||||
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
segno_match.unwrap().as_str().parse::<u32>()?
|
||||
};
|
||||
|
||||
let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
|
||||
let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
|
||||
let lsn_hi: u64 = caps.name("lsnhi").unwrap().as_str().parse()?;
|
||||
let lsn_lo: u64 = caps.name("lsnlo").unwrap().as_str().parse()?;
|
||||
let lsn = lsn_hi << 32 | lsn_lo;
|
||||
|
||||
return Ok((relnode, forknum, segno, lsn));
|
||||
}
|
||||
|
||||
fn parse_nonrel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
//TODO parse segno from xact filenames too
|
||||
if let Some(fname) = path.strip_prefix("pg_xact/") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_XACT_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else if let Some(fname) = path.strip_prefix("pg_multixact/offsets") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_MXACT_OFFSETS_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else if let Some(fname) = path.strip_prefix("pg_multixact/members") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_MXACT_MEMBERS_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else {
|
||||
return Err(FilePathError::new("invalid non relation data file name"));
|
||||
}
|
||||
Ok((relnode, forknum, segno, lsn))
|
||||
}
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
@@ -333,48 +192,22 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
if fname.contains("pg_control") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_CONTROLFILE_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
if fname.contains("pg_filenode") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
});
|
||||
})
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split("/");
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let dbnode = u32::from_str_radix(dbnode_str, 10)?;
|
||||
let dbnode: u32 = dbnode_str.parse()?;
|
||||
let fname = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
@@ -382,34 +215,21 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
};
|
||||
|
||||
if fname.contains("pg_filenode") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
});
|
||||
})
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
return Err(FilePathError::new("tablespaces not supported"));
|
||||
Err(FilePathError::new("tablespaces not supported"))
|
||||
} else {
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
Err(FilePathError::new("invalid relation data file name"))
|
||||
}
|
||||
}
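// Illustrative inputs for parse_rel_file_path (paths are hypothetical):
//
//   "global/1262_0000000001000000"          -> spcnode = GLOBALTABLESPACE_OID,
//                                              dbnode 0, relnode 1262, main fork
//   "base/13008/16384_fsm_0000000001000000" -> spcnode = DEFAULTTABLESPACE_OID,
//                                              dbnode 13008, relnode 16384, fsm fork
//   any path under "pg_tblspc/"             -> rejected: "tablespaces not supported"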
|
||||
|
||||
@@ -432,55 +252,23 @@ async fn slurp_base_file(
|
||||
|
||||
let mut bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
|
||||
// pg_filenode.map has non-standard size - 512 bytes
|
||||
if parsed.forknum == pg_constants::PG_FILENODEMAP_FORKNUM {
|
||||
let b = bytes.clone();
|
||||
controlfile::decode_filemapping(b);
|
||||
while bytes.remaining() >= 512 {
|
||||
let tag = page_cache::BufferTag {
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
rel: page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
blknum: 0,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(512));
|
||||
}
|
||||
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
forknum: parsed.forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
|
||||
pcache.relsize_inc(&tag, Some(0));
|
||||
} else {
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
|
||||
let reltag = page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
};
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
|
||||
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
blknum: blknum,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
|
||||
pcache.relsize_inc(&reltag, Some(blknum));
|
||||
|
||||
blknum += 1;
|
||||
}
|
||||
blknum += 1;
|
||||
}
|
||||
}
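// Worked example of the segment-to-block arithmetic above (illustrative only):
// with 8192-byte blocks and 1 GiB relation segments, each segment holds
// 1024 * 1024 * 1024 / 8192 = 131072 blocks, so a base image of segment 2
// starts at blknum = 2 * 131072 = 262144, and the loop hands out consecutive
// block numbers from there as it slices 8192-byte pages off the file.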
|
||||
|
||||
@@ -14,7 +14,6 @@ use tui::text::{Span, Spans, Text};
|
||||
use tui::widgets::{Block, BorderType, Borders, Paragraph, Widget};
|
||||
use tui::Terminal;
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
|
||||
lazy_static! {
|
||||
@@ -32,7 +31,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -42,7 +41,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -53,7 +52,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -66,7 +65,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -85,14 +84,14 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
false
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
return slog_scope::set_global_logger(logger);
|
||||
slog_scope::set_global_logger(logger)
|
||||
}
|
||||
|
||||
pub fn ui_main<'b>() -> Result<(), Box<dyn Error>> {
|
||||
pub fn ui_main() -> Result<(), Box<dyn Error>> {
|
||||
// Terminal initialization
|
||||
let stdout = io::stdout().into_raw_mode()?;
|
||||
let stdout = MouseTerminal::from(stdout);
|
||||
@@ -172,6 +171,11 @@ pub fn ui_main<'b>() -> Result<(), Box<dyn Error>> {
|
||||
})?;
|
||||
|
||||
// If the user presses 'q', quit.
|
||||
|
||||
// silence clippy's suggestion to rewrite this as an if-statement. Match
|
||||
// makes more sense as soon as we get another command than 'q'.
|
||||
#[allow(clippy::single_match)]
|
||||
#[allow(clippy::collapsible_match)]
|
||||
if let Event::Input(key) = events.next()? {
|
||||
match key {
|
||||
Key::Char('q') => {
|
||||
@@ -188,6 +192,7 @@ pub fn ui_main<'b>() -> Result<(), Box<dyn Error>> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct LogWidget<'a> {
|
||||
logger: &'a TuiLogger,
|
||||
title: &'a str,
|
||||
@@ -229,7 +234,7 @@ impl<'a> Widget for LogWidget<'a> {
|
||||
// Render a widget to show some metrics
|
||||
struct MetricsWidget {}
|
||||
|
||||
fn get_metric_u64<'a>(title: &'a str, value: u64) -> Spans<'a> {
|
||||
fn _get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
@@ -240,7 +245,9 @@ fn get_metric_u64<'a>(title: &'a str, value: u64) -> Spans<'a> {
|
||||
])
|
||||
}
|
||||
|
||||
fn get_metric_str<'a>(title: &'a str, value: &'a str) -> Spans<'a> {
|
||||
// This is not used since LSNs were removed from page cache stats.
|
||||
// Maybe it will be used in the future?
|
||||
fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
@@ -248,13 +255,6 @@ fn get_metric_str<'a>(title: &'a str, value: &'a str) -> Spans<'a> {
|
||||
])
|
||||
}
|
||||
|
||||
// FIXME: We really should define a datatype for LSNs, with Display trait and
|
||||
// helper functions. There's one in tokio-postgres, but I don't think we want
|
||||
// to rely on that.
|
||||
fn format_lsn(lsn: u64) -> String {
|
||||
return format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff);
|
||||
}
|
||||
|
||||
impl tui::widgets::Widget for MetricsWidget {
|
||||
fn render(self, area: Rect, buf: &mut Buffer) {
|
||||
let block = Block::default()
|
||||
@@ -265,17 +265,24 @@ impl tui::widgets::Widget for MetricsWidget {
|
||||
|
||||
block.render(area, buf);
|
||||
|
||||
#[allow(unused_mut)]
|
||||
let mut lines: Vec<Spans> = Vec::new();
|
||||
|
||||
let page_cache_stats = crate::page_cache::get_stats();
|
||||
// FIXME
|
||||
//let page_cache_stats = crate::page_cache::get_stats();
|
||||
|
||||
// This is not used since LSNs were removed from page cache stats.
|
||||
// Maybe it will be used in the future?
|
||||
/*
|
||||
let lsnrange = format!(
|
||||
"{} - {}",
|
||||
format_lsn(page_cache_stats.first_valid_lsn),
|
||||
format_lsn(page_cache_stats.last_valid_lsn)
|
||||
page_cache_stats.first_valid_lsn, page_cache_stats.last_valid_lsn
|
||||
);
|
||||
let last_valid_recordlsn_str = format_lsn(page_cache_stats.last_record_lsn);
|
||||
let last_valid_recordlsn_str = page_cache_stats.last_record_lsn.to_string();
|
||||
lines.push(get_metric_str("Valid LSN range", &lsnrange));
|
||||
lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str));
|
||||
*/
|
||||
/*
|
||||
lines.push(get_metric_u64(
|
||||
"# of cache entries",
|
||||
page_cache_stats.num_entries,
|
||||
@@ -292,7 +299,7 @@ impl tui::widgets::Widget for MetricsWidget {
|
||||
"# of GetPage@LSN calls",
|
||||
page_cache_stats.num_getpage_requests,
|
||||
));
|
||||
|
||||
*/
|
||||
let text = Text::from(lines);
|
||||
|
||||
Paragraph::new(text).render(inner_area, buf);
|
||||
|
||||
@@ -10,7 +10,6 @@ use std::time::Duration;
|
||||
use termion::event::Key;
|
||||
use termion::input::TermRead;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub enum Event<I> {
|
||||
Input(I),
|
||||
Tick,
|
||||
@@ -77,8 +76,8 @@ impl Events {
|
||||
};
|
||||
Events {
|
||||
rx,
|
||||
ignore_exit_key,
|
||||
input_handle,
|
||||
ignore_exit_key,
|
||||
tick_handle,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
//
|
||||
use chrono::offset::Local;
|
||||
use chrono::DateTime;
|
||||
use slog;
|
||||
use slog::{Drain, Level, OwnedKVList, Record};
|
||||
use slog_async::AsyncRecord;
|
||||
use std::collections::VecDeque;
|
||||
@@ -52,7 +51,7 @@ impl Drain for TuiLogger {
|
||||
events.pop_back();
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,7 +80,7 @@ impl<'b> TuiLoggerWidget<'b> {
|
||||
style_trace: None,
|
||||
style_info: None,
|
||||
show_module: true,
|
||||
logger: logger,
|
||||
logger,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -168,7 +167,7 @@ impl<'b> Widget for TuiLoggerWidget<'b> {
|
||||
Level::Debug => (self.style_debug, "DEBUG", true),
|
||||
Level::Trace => (self.style_trace, "TRACE", true),
|
||||
};
|
||||
line.push(Span::styled(txt, lvl_style.unwrap_or(Style::default())));
|
||||
line.push(Span::styled(txt, lvl_style.unwrap_or_default()));
|
||||
|
||||
if self.show_module {
|
||||
line.push(Span::raw(" "));
|
||||
|
||||
@@ -1,168 +1,211 @@
|
||||
//
|
||||
// WAL receiver
|
||||
//
|
||||
// The WAL receiver connects to the WAL safekeeper service, and streams WAL.
|
||||
// For each WAL record, it decodes the record to figure out which data blocks
|
||||
// the record affects, and adds the records to the page cache.
|
||||
//
|
||||
use log::*;
|
||||
|
||||
use tokio::runtime;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tokio_stream::StreamExt;
|
||||
//!
|
||||
//! WAL receiver
|
||||
//!
|
||||
//! The WAL receiver connects to the WAL safekeeper service, and streams WAL.
|
||||
//! For each WAL record, it decodes the record to figure out which data blocks
|
||||
//! the record affects, and adds the records to the page cache.
|
||||
//!
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::waldecoder::WalStreamDecoder;
|
||||
use crate::repository::*;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Error;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use tokio_postgres::{connect_replication, Error, NoTls, ReplicationMode};
|
||||
use postgres_types::PgLsn;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
//
|
||||
// We keep one WAL Receiver active per timeline.
|
||||
//
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref WAL_RECEIVERS: Mutex<HashMap<ZTimelineId, WalReceiverEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
|
||||
match receivers.get_mut(&timelineid) {
|
||||
Some(receiver) => {
|
||||
receiver.wal_producer_connstr = wal_producer_connstr.into();
|
||||
}
|
||||
None => {
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
thread_main(conf, timelineid);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
};
|
||||
}
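// A minimal usage sketch, not part of the original diff; the connection
// string and the example_ helper name are made up for illustration.
#[allow(dead_code)]
fn example_start_wal_receiving(conf: &'static PageServerConf, timelineid: ZTimelineId) {
    // Registers the timeline in WAL_RECEIVERS and spawns the receiver thread.
    // Calling it again for the same timeline would only update the stored
    // connection string, which the running thread picks up via
    // get_wal_producer_connstr() the next time it (re)connects.
    launch_wal_receiver(conf, timelineid, "host=127.0.0.1 port=5454 user=zenith");
}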
|
||||
|
||||
// Look up current WAL producer connection string in the hash table
|
||||
fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
|
||||
let receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
|
||||
receivers
|
||||
.get(&timelineid)
|
||||
.unwrap()
|
||||
.wal_producer_connstr
|
||||
.clone()
|
||||
}
|
||||
|
||||
//
|
||||
// This is the entry point for the WAL receiver thread.
|
||||
//
|
||||
pub fn thread_main(conf: PageServerConf, wal_producer_connstr: &String) {
|
||||
info!("WAL receiver thread started: '{}'", wal_producer_connstr);
|
||||
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId) {
|
||||
info!(
|
||||
"WAL receiver thread started for timeline : '{}'",
|
||||
timelineid
|
||||
);
|
||||
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
//
|
||||
loop {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
runtime.block_on(async {
|
||||
loop {
|
||||
let _res = walreceiver_main(conf.clone(), wal_producer_connstr).await;
|
||||
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr);
|
||||
|
||||
// TODO: print/log the error
|
||||
if let Err(e) = res {
|
||||
info!(
|
||||
"WAL streaming connection failed, retrying in 1 second...: {:?}",
|
||||
_res
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
sleep(Duration::from_secs(1));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async fn walreceiver_main(
|
||||
conf: PageServerConf,
|
||||
wal_producer_connstr: &String,
|
||||
fn walreceiver_main(
|
||||
_conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
) -> Result<(), Error> {
|
||||
// Connect to the database in replication mode.
|
||||
debug!("connecting to {}...", wal_producer_connstr);
|
||||
let (mut rclient, connection) = connect_replication(
|
||||
wal_producer_connstr.as_str(),
|
||||
NoTls,
|
||||
ReplicationMode::Physical,
|
||||
)
|
||||
.await?;
|
||||
debug!("connected!");
|
||||
info!("connecting to {:?}", wal_producer_connstr);
|
||||
let connect_cfg = format!("{} replication=true", wal_producer_connstr);
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
let mut rclient = Client::connect(&connect_cfg, NoTls)?;
|
||||
info!("connected!");
|
||||
|
||||
let identify_system = rclient.identify_system().await?;
|
||||
let end_of_wal = u64::from(identify_system.xlogpos());
|
||||
let identify = identify_system(&mut rclient)?;
|
||||
info!("{:?}", identify);
|
||||
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
|
||||
let mut caught_up = false;
|
||||
|
||||
let sysid: u64 = identify_system.systemid().parse().unwrap();
|
||||
let pcache = page_cache::get_pagecache(conf, sysid);
|
||||
let repository = page_cache::get_repository();
|
||||
let timeline = repository.get_timeline(timelineid).unwrap();
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
//
|
||||
let mut startpoint = pcache.get_last_valid_lsn();
|
||||
if startpoint == 0 {
|
||||
// If we start here with identify_system.xlogpos() we will have a race condition with
|
||||
// postgres start: an insert into postgres may request a page that was modified with an lsn
|
||||
// smaller than identify_system.xlogpos().
|
||||
//
|
||||
// Current procedure for starting postgres will anyway be changed to something
|
||||
// different like having 'initdb' method on a pageserver (or importing some shared
|
||||
// empty database snapshot), so for now I just put start of first segment which
|
||||
// seems to be a valid record.
|
||||
pcache.init_valid_lsn(0x_1_000_000_u64);
|
||||
startpoint = u64::from(0x_1_000_000_u64);
|
||||
} else {
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
// FIXME: It probably would be better to always start streaming from the beginning
|
||||
// of the page, or the segment, so that we could check the page/segment headers
|
||||
// too. Just for the sake of paranoia.
|
||||
if startpoint % 8 != 0 {
|
||||
startpoint += 8 - (startpoint % 8);
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"starting replication from {:X}/{:X}, server is at {:X}/{:X}...",
|
||||
(startpoint >> 32),
|
||||
(startpoint & 0xffffffff),
|
||||
(end_of_wal >> 32),
|
||||
(end_of_wal & 0xffffffff)
|
||||
);
|
||||
let startpoint = tokio_postgres::types::Lsn::from(startpoint);
|
||||
let mut physical_stream = rclient
|
||||
.start_physical_replication(None, startpoint, None)
|
||||
.await?;
|
||||
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
|
||||
// If we had previously received WAL up to some point in the middle of a WAL record, we
|
||||
// better start from the end of last full WAL record, not in the middle of one. Hence,
|
||||
// use 'last_record_lsn' rather than 'last_valid_lsn' here.
|
||||
let last_rec_lsn = timeline.get_last_record_lsn();
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
while let Some(replication_message) = physical_stream.next().await {
|
||||
match replication_message? {
|
||||
if startpoint == Lsn(0) {
|
||||
error!("No previous WAL position");
|
||||
}
|
||||
|
||||
startpoint = Lsn::max(
|
||||
startpoint,
|
||||
Lsn(end_of_wal.0 & !(pg_constants::WAL_SEGMENT_SIZE as u64 - 1)),
|
||||
);
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
// FIXME: It probably would be better to always start streaming from the beginning
|
||||
// of the page, or the segment, so that we could check the page/segment headers
|
||||
// too. Just for the sake of paranoia.
|
||||
startpoint += startpoint.calc_padding(8u32);
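// Worked example of the padding step above (illustrative numbers, assuming
// Lsn::calc_padding(8) returns the distance to the next 8-byte boundary):
// WAL records are MAXALIGNed to 8 bytes, so if the last full record ended at
// LSN 0/16D9A2A, replication should start at the next multiple of 8, i.e.
// Lsn(0x16D9A2A) + 6 = Lsn(0x16D9A30).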
|
||||
|
||||
debug!(
|
||||
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, timelineid, end_of_wal
|
||||
);
|
||||
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
|
||||
let copy_stream = rclient.copy_both_simple(&query)?;
|
||||
let mut physical_stream = ReplicationIter::new(copy_stream);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut checkpoint = CheckPoint::new(startpoint.0, identify.timeline);
|
||||
let checkpoint_tag = BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM);
|
||||
if let Some(checkpoint_bytes) = timeline.get_page_image(checkpoint_tag, Lsn(0))? {
|
||||
checkpoint = decode_checkpoint(checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
} else {
|
||||
error!("No checkpoint record was found in reposistory");
|
||||
}
|
||||
while let Some(replication_message) = physical_stream.next()? {
|
||||
match replication_message {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
let data = xlog_data.data();
|
||||
let startlsn = xlog_data.wal_start();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
|
||||
trace!(
|
||||
"received XLogData between {:X}/{:X} and {:X}/{:X}",
|
||||
(startlsn >> 32),
|
||||
(startlsn & 0xffffffff),
|
||||
(endlsn >> 32),
|
||||
(endlsn & 0xffffffff)
|
||||
);
|
||||
write_wal_file(startlsn, timelineid, pg_constants::WAL_SEGMENT_SIZE, data)?;
|
||||
|
||||
trace!("received XLogData between {} and {}", startlsn, endlsn);
|
||||
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
loop {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode() {
|
||||
let decoded =
|
||||
crate::waldecoder::decode_wal_record(startlsn, recdata.clone());
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let old_checkpoint_bytes = encode_checkpoint(checkpoint);
|
||||
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
|
||||
timeline.save_decoded_record(decoded, recdata, lsn)?;
|
||||
|
||||
// Put the WAL record to the page cache. We make a separate copy of
|
||||
// it for every block it modifies. (The actual WAL record is kept in
|
||||
// a Bytes, which uses a reference counter for the underlying buffer,
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn: lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_valid_lsn(lsn);
|
||||
} else {
|
||||
break;
|
||||
let new_checkpoint_bytes = encode_checkpoint(checkpoint);
|
||||
if new_checkpoint_bytes != old_checkpoint_bytes {
|
||||
timeline.put_page_image(checkpoint_tag, Lsn(0), new_checkpoint_bytes);
|
||||
}
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
}
|
||||
|
||||
// Update the last_valid LSN value in the page cache one more time. We updated
|
||||
@@ -171,24 +214,180 @@ async fn walreceiver_main(
|
||||
// better reflect that, because GetPage@LSN requests might also point in the
|
||||
// middle of a record, if the request LSN was taken from the server's current
|
||||
// flush ptr.
|
||||
pcache.advance_last_valid_lsn(endlsn);
|
||||
timeline.advance_last_valid_lsn(endlsn);
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!(
|
||||
"caught up at LSN {:X}/{:X}",
|
||||
(endlsn >> 32),
|
||||
(endlsn & 0xffffffff)
|
||||
);
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
caught_up = true;
|
||||
}
|
||||
}
|
||||
|
||||
ReplicationMessage::PrimaryKeepAlive(_keepalive) => {
|
||||
trace!("received PrimaryKeepAlive");
|
||||
// FIXME: Reply, or the connection will time out
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
let wal_end = keepalive.wal_end();
|
||||
let timestamp = keepalive.timestamp();
|
||||
let reply_requested: bool = keepalive.reply() != 0;
|
||||
|
||||
trace!(
|
||||
"received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})",
|
||||
wal_end,
|
||||
timestamp,
|
||||
reply_requested,
|
||||
);
|
||||
if reply_requested {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(timeline.get_last_valid_lsn()));
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::from(0);
|
||||
let ts = SystemTime::now();
|
||||
const NO_REPLY: u8 = 0u8;
|
||||
|
||||
physical_stream
|
||||
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
///
|
||||
/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html
|
||||
#[derive(Debug)]
|
||||
pub struct IdentifySystem {
|
||||
systemid: u64,
|
||||
timeline: u32,
|
||||
xlogpos: PgLsn,
|
||||
dbname: Option<String>,
|
||||
}
|
||||
|
||||
/// There was a problem parsing the response to
|
||||
/// a postgres IDENTIFY_SYSTEM command.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("IDENTIFY_SYSTEM parse error")]
|
||||
pub struct IdentifyError;
|
||||
|
||||
/// Run the postgres `IDENTIFY_SYSTEM` command
|
||||
pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
|
||||
let query_str = "IDENTIFY_SYSTEM";
|
||||
let response = client.simple_query(query_str)?;
|
||||
|
||||
// get(N) from row, then parse it as some destination type.
|
||||
fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
|
||||
where
|
||||
T: FromStr,
|
||||
{
|
||||
let val = row.get(idx).ok_or(IdentifyError)?;
|
||||
val.parse::<T>().or(Err(IdentifyError))
|
||||
}
|
||||
|
||||
// extract the row contents into an IdentifySystem struct.
|
||||
// written as a closure so I can use ? for Option here.
|
||||
if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
|
||||
Ok(IdentifySystem {
|
||||
systemid: get_parse(first_row, 0)?,
|
||||
timeline: get_parse(first_row, 1)?,
|
||||
xlogpos: get_parse(first_row, 2)?,
|
||||
dbname: get_parse(first_row, 3).ok(),
|
||||
})
|
||||
} else {
|
||||
Err(IdentifyError.into())
|
||||
}
|
||||
}
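// Usage sketch (hypothetical; mirrors what walreceiver_main does above): run
// IDENTIFY_SYSTEM right after opening a replication connection.
#[allow(dead_code)]
fn example_identify(wal_producer_connstr: &str) -> Result<(), Error> {
    let connect_cfg = format!("{} replication=true", wal_producer_connstr);
    let mut client = Client::connect(&connect_cfg, NoTls)?;
    let ident = identify_system(&mut client)?;
    info!(
        "system id {}, timeline {}, current write position {}",
        ident.systemid, ident.timeline, ident.xlogpos
    );
    Ok(())
}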
|
||||
|
||||
fn write_wal_file(
|
||||
startpos: Lsn,
|
||||
timeline: ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
buf: &[u8],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size);
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
|
||||
/*
|
||||
* If crossing a WAL boundary, only write up until we reach wal
|
||||
* segment size.
|
||||
*/
|
||||
if xlogoff + bytes_left > wal_seg_size {
|
||||
bytes_to_write = wal_seg_size - xlogoff;
|
||||
} else {
|
||||
bytes_to_write = bytes_left;
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
wal_seg_size,
|
||||
);
|
||||
let wal_file_path = wal_dir.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = wal_dir.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
let mut wal_file: File;
|
||||
/* Try to open already completed segment */
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
|
||||
wal_file = file;
|
||||
partial = false;
|
||||
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
|
||||
/* Try to open an existing partial file */
|
||||
wal_file = file;
|
||||
partial = true;
|
||||
} else {
|
||||
/* Create and fill new partial file */
|
||||
partial = true;
|
||||
match OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(&ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
|
||||
|
||||
// FIXME: Flush the file
|
||||
//wal_file.sync_all()?;
|
||||
}
|
||||
/* Write was successful, advance our position */
|
||||
bytes_written += bytes_to_write;
|
||||
bytes_left -= bytes_to_write;
|
||||
start_pos += bytes_to_write as u64;
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if start_pos.segment_offset(wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
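// Worked example of the boundary handling above (illustrative numbers): with
// wal_seg_size = 16 MiB, a write that starts at segment offset
// xlogoff = 16 MiB - 4 KiB with bytes_left = 12 KiB first writes
// bytes_to_write = 4 KiB to finish the current segment (renaming its
// ".partial" file to its final name), then continues with xlogoff = 0 and
// puts the remaining 8 KiB at the start of the next segment's ".partial" file.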
|
||||
|
||||
@@ -1,162 +1,522 @@
|
||||
//
|
||||
// WAL redo
|
||||
//
|
||||
// We rely on Postgres to perform WAL redo for us. We launch a
|
||||
// postgres process in special "wal redo" mode that's similar to
|
||||
// single-user mode. We then pass the previous page image, if any,
|
||||
// and all the WAL records we want to apply, to the postgres
|
||||
// process. Then we get the page image back. Communication with the
|
||||
// postgres process happens via stdin/stdout
|
||||
//
|
||||
// See src/backend/tcop/zenith_wal_redo.c for the other side of
|
||||
// this communication.
|
||||
//
|
||||
// TODO: Even though the postgres code runs in a separate process,
|
||||
// it's not a secure sandbox.
|
||||
//
|
||||
//!
|
||||
//! WAL redo
|
||||
//!
|
||||
//! We rely on Postgres to perform WAL redo for us. We launch a
|
||||
//! postgres process in special "wal redo" mode that's similar to
|
||||
//! single-user mode. We then pass the previous page image, if any,
|
||||
//! and all the WAL records we want to apply, to the postgres
|
||||
//! process. Then we get the page image back. Communication with the
|
||||
//! postgres process happens via stdin/stdout
|
||||
//!
|
||||
//! See src/backend/tcop/zenith_wal_redo.c for the other side of
|
||||
//! this communication.
|
||||
//!
|
||||
//! TODO: Even though the postgres code runs in a separate process,
|
||||
//! it's not a secure sandbox.
|
||||
//!
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use std::assert;
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::Error;
|
||||
use std::sync::Arc;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Stdio;
|
||||
use std::sync::mpsc;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::{path::PathBuf, process::Stdio};
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{Child, ChildStdin, ChildStdout, Command};
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::process::{ChildStdin, ChildStdout, Command};
|
||||
use tokio::time::timeout;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use crate::repository::BufferTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::XLogRecord;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::CacheEntry;
|
||||
use crate::page_cache::WALRecord;
|
||||
use crate::{page_cache::BufferTag, PageServerConf};
|
||||
///
|
||||
/// WAL Redo Manager is responsible for replaying WAL records.
|
||||
///
|
||||
/// Callers use the WAL redo manager through this abstract interface,
|
||||
/// which makes it easy to mock it in tests.
|
||||
pub trait WalRedoManager: Send + Sync {
|
||||
/// Apply some WAL records.
|
||||
///
|
||||
/// The caller passes an old page image, and WAL records that should be
|
||||
/// applied over it. The return value is a new page image, after applying
|
||||
/// the records.
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
}
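// Sketch of the intended call pattern (hypothetical call site; `conf`, `tag`,
// `lsn`, `base_img` and `records` would come from the repository layer):
//
//   let manager = PostgresRedoManager::new(conf);
//   let page: Bytes = manager.request_redo(tag, lsn, base_img, records)?;
//
// request_redo blocks until the WAL redo thread has replayed the records and
// sent back the reconstructed page image over the response channel.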
|
||||
|
||||
static TIMEOUT: Duration = Duration::from_secs(20);
|
||||
|
||||
//
|
||||
// Main entry point for the WAL applicator thread.
|
||||
//
|
||||
pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) {
|
||||
info!("WAL redo thread started {}", sys_id);
|
||||
///
|
||||
/// The implementation consists of two parts: PostgresRedoManager, and
|
||||
/// PostgresRedoManagerInternal. PostgresRedoManager is the public struct
|
||||
/// that can be used to send redo requests to the manager.
|
||||
/// PostgresRedoManagerInternal is used by the manager thread itself.
|
||||
///
|
||||
pub struct PostgresRedoManager {
|
||||
request_tx: Mutex<mpsc::Sender<WalRedoRequest>>,
|
||||
}
|
||||
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
// runtime for the async part.
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
struct PostgresRedoManagerInternal {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
|
||||
request_rx: mpsc::Receiver<WalRedoRequest>,
|
||||
}
|
||||
|
||||
// Loop forever, handling requests as they come.
|
||||
let walredo_channel_receiver = &pcache.walredo_receiver;
|
||||
loop {
|
||||
let mut process: WalRedoProcess;
|
||||
let datadir = conf.data_dir.join(format!("wal-redo/{}", sys_id));
|
||||
#[derive(Debug)]
|
||||
struct WalRedoRequest {
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
|
||||
info!("launching WAL redo postgres process {}", sys_id);
|
||||
{
|
||||
let _guard = runtime.enter();
|
||||
process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
|
||||
response_channel: mpsc::Sender<Result<Bytes, WalRedoError>>,
|
||||
}
|
||||
|
||||
/// An error happened in WAL redo
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum WalRedoError {
|
||||
#[error(transparent)]
|
||||
IoError(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
///
|
||||
/// Public interface of WAL redo manager
|
||||
///
|
||||
impl PostgresRedoManager {
|
||||
///
|
||||
/// Create a new PostgresRedoManager.
|
||||
///
|
||||
/// This launches a new thread to handle the requests.
|
||||
pub fn new(conf: &'static PageServerConf) -> PostgresRedoManager {
|
||||
let (tx, rx) = mpsc::channel();
|
||||
|
||||
//
|
||||
// Launch the WAL redo thread
|
||||
//
|
||||
// Get mutable references to the values that we need to pass to the
|
||||
// thread.
|
||||
let request_rx = rx;
|
||||
|
||||
// Currently, the join handle is not saved anywhere and we
|
||||
// won't try to restart the thread if it dies.
|
||||
let _walredo_thread = std::thread::Builder::new()
|
||||
.name("WAL redo thread".into())
|
||||
.spawn(move || {
|
||||
let mut internal = PostgresRedoManagerInternal { conf, request_rx };
|
||||
internal.wal_redo_main();
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
PostgresRedoManager {
|
||||
request_tx: Mutex::new(tx),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pretty arbitrarily, reuse the same Postgres process for 100 requests.
|
||||
// After that, kill it and start a new one. This is mostly to avoid
|
||||
// using up all shared buffers in Postgres's shared buffer cache; we don't
|
||||
// want to write any pages to disk in the WAL redo process.
|
||||
for _i in 1..100 {
|
||||
let request = walredo_channel_receiver.recv().unwrap();
|
||||
impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
/// Request the WAL redo manager to apply some WAL records
|
||||
///
|
||||
/// The WAL redo is handled by a separate thread, so this just sends a request
|
||||
/// to the thread and waits for response.
|
||||
///
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
// Create a channel where to receive the response
|
||||
let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
|
||||
|
||||
let result = handle_apply_request(&pcache, &process, &runtime, request);
|
||||
if result.is_err() {
|
||||
// On error, kill the process.
|
||||
break;
|
||||
let request = WalRedoRequest {
|
||||
tag,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
response_channel: tx,
|
||||
};
|
||||
|
||||
self.request_tx
|
||||
.lock()
|
||||
.unwrap()
|
||||
.send(request)
|
||||
.expect("could not send WAL redo request");
|
||||
|
||||
rx.recv()
|
||||
.expect("could not receive response to WAL redo request")
|
||||
}
|
||||
}
|
||||
|
||||
fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
|
||||
return ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
|
||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize;
|
||||
}
|
||||
|
||||
fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
|
||||
return (xid as u16) % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP
|
||||
* pg_constants::MXACT_MEMBER_BITS_PER_XACT;
|
||||
}
|
||||
|
||||
/* Location (byte offset within page) of TransactionId of given member */
|
||||
fn mx_offset_to_member_offset(xid: MultiXactId) -> usize {
|
||||
return mx_offset_to_flags_offset(xid)
|
||||
+ (pg_constants::MULTIXACT_FLAGBYTES_PER_GROUP
|
||||
+ (xid as u16 % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP) * 4)
|
||||
as usize;
|
||||
}
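// Worked example (assuming the usual PostgreSQL multixact layout:
// MULTIXACT_MEMBERS_PER_MEMBERGROUP = 4, MULTIXACT_FLAGBYTES_PER_GROUP = 4,
// MULTIXACT_MEMBERGROUP_SIZE = 20 and MXACT_MEMBER_BITS_PER_XACT = 8):
// for member offset 6 within a page, the group index is 6 / 4 = 1, so
// mx_offset_to_flags_offset(6) = 1 * 20 = 20,
// mx_offset_to_flags_bitshift(6) = (6 % 4) * 8 = 16, and
// mx_offset_to_member_offset(6) = 20 + 4 + (6 % 4) * 4 = 32.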
|
||||
|
||||
///
|
||||
/// WAL redo thread
|
||||
///
|
||||
impl PostgresRedoManagerInternal {
|
||||
//
|
||||
// Main entry point for the WAL applicator thread.
|
||||
//
|
||||
fn wal_redo_main(&mut self) {
|
||||
info!("WAL redo thread started");
|
||||
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
// runtime for the async part.
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let process: PostgresRedoProcess;
|
||||
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with a constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = self.conf.workdir.join("wal-redo-datadir");
|
||||
|
||||
info!("launching WAL redo postgres process");
|
||||
|
||||
process = runtime
|
||||
.block_on(PostgresRedoProcess::launch(&datadir))
|
||||
.unwrap();
|
||||
|
||||
// Loop forever, handling requests as they come.
|
||||
loop {
|
||||
let request = self
|
||||
.request_rx
|
||||
.recv()
|
||||
.expect("WAL redo request channel was closed");
|
||||
|
||||
let result = runtime.block_on(self.handle_apply_request(&process, &request));
|
||||
let result_ok = result.is_ok();
|
||||
|
||||
// Send the result to the requester
|
||||
let _ = request.response_channel.send(result);
|
||||
|
||||
if !result_ok {
|
||||
error!("wal-redo-postgres failed to apply request {:?}", request);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("killing WAL redo postgres process");
|
||||
let _ = runtime.block_on(process.stdin.get_mut().shutdown());
|
||||
let mut child = process.child;
|
||||
drop(process.stdin);
|
||||
let _ = runtime.block_on(child.wait());
|
||||
///
|
||||
/// Process one request for WAL redo.
|
||||
///
|
||||
async fn handle_apply_request(
|
||||
&self,
|
||||
process: &PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let tag = request.tag;
|
||||
let lsn = request.lsn;
|
||||
let base_img = request.base_img.clone();
|
||||
let records = &request.records;
|
||||
|
||||
let nrecords = records.len();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if tag.rel.forknum > pg_constants::INIT_FORKNUM {
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
//move to main data
|
||||
// TODO probably, we should store some records in our special format
|
||||
// to avoid this weird parsing on replay
|
||||
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
|
||||
if buf.remaining() > skip {
|
||||
buf.advance(skip);
|
||||
}
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let mut status = 0;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
|
||||
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
|
||||
}
|
||||
//handle subtrans
|
||||
let _xact_time = buf.get_i64_le();
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag.blknum == blkno {
|
||||
status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
|
||||
transaction_id_set_status(subxact, status, &mut page);
|
||||
}
|
||||
}
|
||||
}
|
||||
if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO handle this too?
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
|
||||
let nmsgs = buf.get_i32_le();
|
||||
for _i in 0..nmsgs {
|
||||
let sizeof_shared_invalidation_message = 0;
|
||||
buf.advance(sizeof_shared_invalidation_message);
|
||||
}
|
||||
}
|
||||
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
|
||||
let xid = buf.get_u32_le();
|
||||
transaction_id_set_status(xid, status, &mut page);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT || info == pg_constants::XLOG_XACT_ABORT_PREPARED {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
if info == pg_constants::XLOG_XACT_ABORT {
|
||||
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
|
||||
}
|
||||
//handle subtrans
|
||||
let _xact_time = buf.get_i64_le();
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag.blknum == blkno {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
transaction_id_set_status(subxact, status, &mut page);
|
||||
}
|
||||
}
|
||||
}
|
||||
if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO handle this too?
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
|
||||
let nmsgs = buf.get_i32_le();
|
||||
for _i in 0..nmsgs {
|
||||
let sizeof_shared_invalidation_message = 0;
|
||||
buf.advance(sizeof_shared_invalidation_message);
|
||||
}
|
||||
}
|
||||
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
|
||||
let xid = buf.get_u32_le();
|
||||
transaction_id_set_status(xid, status, &mut page);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||
info!("Apply prepare {} record", xlogrec.xl_xid);
|
||||
page.clear();
|
||||
page.extend_from_slice(&buf[..]);
|
||||
} else {
|
||||
error!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn,
|
||||
record.main_data_offset, record.rec.len());
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
if tag.rel.forknum == pg_constants::PG_MXACT_OFFSETS_FORKNUM {
|
||||
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
} else {
|
||||
assert!(tag.rel.forknum == pg_constants::PG_MXACT_MEMBERS_FORKNUM);
|
||||
for i in 0..xlrec.nmembers {
|
||||
let blkno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
if blkno == tag.blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
let flagsoff = mx_offset_to_flags_offset(offset);
|
||||
let bshift = mx_offset_to_flags_bitshift(offset);
|
||||
let mut flagsval =
|
||||
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &=
|
||||
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
|
||||
<< bshift);
|
||||
flagsval |= xlrec.members[i as usize].status << bshift;
|
||||
LittleEndian::write_u32(
|
||||
&mut page[flagsoff..flagsoff + 4],
|
||||
flagsval,
|
||||
);
|
||||
LittleEndian::write_u32(
|
||||
&mut page[memberoff..memberoff + 4],
|
||||
xlrec.members[i as usize].xid,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
// empty page image indicates that this SLRU page is truncated and can be removed by GC
|
||||
page.clear();
|
||||
} else {
|
||||
assert!(false);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
page.clear();
|
||||
page.extend_from_slice(&buf[12..]); // skip xl_relmap_update
|
||||
assert!(page.len() == 512); // size of pg_filenode.map
|
||||
}
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
} else {
|
||||
apply_result = process.apply_wal_records(tag, base_img, records).await;
|
||||
}
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result: Result<Bytes, WalRedoError>;
|
||||
|
||||
trace!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
lsn
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
result = Err(WalRedoError::IoError(e));
|
||||
} else {
|
||||
let img = apply_result.unwrap();
|
||||
|
||||
result = Ok(img);
|
||||
}
|
||||
|
||||
// The caller is responsible for sending the response
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_apply_request(
|
||||
pcache: &page_cache::PageCache,
|
||||
process: &WalRedoProcess,
|
||||
runtime: &Runtime,
|
||||
entry_rc: Arc<CacheEntry>,
|
||||
) -> Result<(), Error> {
|
||||
let tag = entry_rc.key.tag;
|
||||
let lsn = entry_rc.key.lsn;
|
||||
let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
|
||||
|
||||
let mut entry = entry_rc.content.lock().unwrap();
|
||||
entry.apply_pending = false;
|
||||
|
||||
let nrecords = records.len();
|
||||
|
||||
let start = Instant::now();
|
||||
let apply_result = process.apply_wal_records(runtime, tag, base_img, records);
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result;
|
||||
|
||||
debug!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
lsn >> 32,
|
||||
lsn & 0xffff_ffff
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
result = Err(e);
|
||||
} else {
|
||||
entry.page_image = Some(apply_result.unwrap());
|
||||
pcache
|
||||
.num_page_images
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
result = Ok(());
|
||||
}
|
||||
|
||||
// Wake up the requester, whether the operation succeeded or not.
|
||||
entry_rc.walredo_condvar.notify_all();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct WalRedoProcess {
|
||||
child: Child,
|
||||
struct PostgresRedoProcess {
|
||||
stdin: RefCell<ChildStdin>,
|
||||
stdout: RefCell<ChildStdout>,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
// Tests that run the pageserver binary set the proper PG_BIN_DIR
|
||||
// and PG_LIB_DIR so that WalRedo would start right postgres. We may later
|
||||
// and PG_LIB_DIR so that WalRedo would start right postgres.
|
||||
|
||||
// do that: We may later
|
||||
// switch to setting same things in pageserver config file.
|
||||
fn launch(datadir: &PathBuf, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres deleting old one.
|
||||
fs::remove_dir_all(datadir.to_str().unwrap()).ok();
|
||||
let initdb = runtime
|
||||
.block_on(
|
||||
Command::new("initdb")
|
||||
.args(&["-D", datadir.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.output(),
|
||||
)
|
||||
async fn launch(datadir: &Path) -> Result<PostgresRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
info!("directory {:?} exists, removing", &datadir);
|
||||
if let Err(e) = fs::remove_dir_all(&datadir) {
|
||||
error!("could not remove old wal-redo-datadir: {:?}", e);
|
||||
}
|
||||
}
|
||||
info!("running initdb in {:?}", datadir.display());
|
||||
let initdb = Command::new("initdb")
|
||||
.args(&["-D", datadir.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.output()
|
||||
.await
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.status.success() {
|
||||
@@ -165,21 +525,29 @@ impl WalRedoProcess {
|
||||
std::str::from_utf8(&initdb.stdout).unwrap(),
|
||||
std::str::from_utf8(&initdb.stderr).unwrap()
|
||||
);
|
||||
} else {
|
||||
// Limit shared_buffers for the wal-redo postgres instance
|
||||
let mut config = OpenOptions::new()
|
||||
.append(true)
|
||||
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
|
||||
config.write_all(b"shared_buffers=128kB\n")?;
|
||||
config.write_all(b"fsync=off\n")?;
|
||||
config.write_all(b"shared_preload_libraries=zenith\n")?;
|
||||
config.write_all(b"zenith.wal_redo=on\n")?;
|
||||
}
|
||||
|
||||
// Start postgres itself
|
||||
let mut child = Command::new("postgres")
|
||||
.arg("--wal-redo")
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env("PGDATA", datadir.to_str().unwrap())
|
||||
.env("PGDATA", datadir)
|
||||
.spawn()
|
||||
.expect("postgres --wal-redo command failed to start");
|
||||
|
||||
info!(
|
||||
"launched WAL redo postgres process on {}",
|
||||
datadir.to_str().unwrap()
|
||||
"launched WAL redo postgres process on {:?}",
|
||||
datadir.display()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().expect("failed to open child's stdin");
|
||||
@@ -200,15 +568,14 @@ impl WalRedoProcess {
|
||||
if res.unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
debug!("wal-redo-postgres: {}", line.trim());
|
||||
error!("wal-redo-postgres: {}", line.trim());
|
||||
line.clear();
|
||||
}
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(WalRedoProcess {
|
||||
child: child,
|
||||
Ok(PostgresRedoProcess {
|
||||
stdin: RefCell::new(stdin),
|
||||
stdout: RefCell::new(stdout),
|
||||
})
|
||||
@@ -218,146 +585,132 @@ impl WalRedoProcess {
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
fn apply_wal_records(
|
||||
async fn apply_wal_records(
|
||||
&self,
|
||||
runtime: &Runtime,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, Error> {
|
||||
records: &[WALRecord],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
let mut stdin = self.stdin.borrow_mut();
|
||||
let mut stdout = self.stdout.borrow_mut();
|
||||
return runtime.block_on(async {
|
||||
//
|
||||
// This async block sends all the commands to the process.
|
||||
//
|
||||
// For reasons I don't understand, this needs to be a "move" block;
|
||||
// otherwise the stdin pipe doesn't get closed, despite the shutdown()
|
||||
// call.
|
||||
//
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
// information that the child writes to its stderr to the page server's log.
|
||||
//
|
||||
// 'f_stdin' handles writing the base image and WAL records to the child process.
|
||||
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
|
||||
// tokio runtime in the 'launch' function already, forwards the logging.
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
}
|
||||
}
|
||||
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.await?;
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.await?;
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
|
||||
// Kill the process. This closes its stdin, which should signal the process
|
||||
// to terminate. TODO: SIGKILL if needed
|
||||
//child.wait();
|
||||
let res = tokio::try_join!(f_stdout, f_stdin)?;
|
||||
|
||||
let res = futures::try_join!(f_stdout, f_stdin)?;
|
||||
let buf = res.0;
|
||||
|
||||
let buf = res.0;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
});
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
}
|
||||
}
|
||||
|
||||
// Functions for constructing messages to send to the postgres WAL redo
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8('B' as u8);
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
tag.pack(&mut buf);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
return buf.freeze();
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
let len = 4 + 5 * 4 + base_img.len();
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8('P' as u8);
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
tag.pack(&mut buf);
|
||||
buf.put(base_img);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
return buf.freeze();
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes {
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8('A' as u8);
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u64(endlsn);
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put(rec);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
return buf.freeze();
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8('G' as u8);
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
tag.pack(&mut buf);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
return buf.freeze();
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
29
pgbuild.sh
@@ -1,29 +0,0 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Purpose of this script is to build and install postgres in a local directory
|
||||
# so that zenith integration tests can find pg binaries and support files.
|
||||
#
|
||||
# ./pgbuild.sh does the following:
|
||||
#
|
||||
# 1) runs an out-of-source build of postgres in the REPO_ROOT/tmp_install/build directory (I'm reusing the
# tmp_install path here since it is already present in .gitignore)
|
||||
#
|
||||
# 2) installs postgres to REPO_ROOT/tmp_install/
|
||||
#
|
||||
REPO_ROOT=$(dirname "$0")
|
||||
REPO_ROOT="`( cd \"$REPO_ROOT\" && pwd )`"
|
||||
|
||||
# configure
|
||||
echo "Configuring postgres build"
|
||||
mkdir -p $REPO_ROOT/tmp_install/build
|
||||
cd $REPO_ROOT/tmp_install/build
|
||||
../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
|
||||
--enable-depend --with-libxml --prefix=/ > configure.log
|
||||
|
||||
# compile
|
||||
echo "Compiling postgres"
|
||||
make -j8 -s
|
||||
export DESTDIR=$REPO_ROOT/tmp_install
|
||||
|
||||
echo "Installing postgres to $DESTDIR"
|
||||
make install -s
|
||||
24
postgres_ffi/Cargo.toml
Normal file
@@ -0,0 +1,24 @@
|
||||
[package]
|
||||
name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
authors = ["Heikki Linnakangas <heikki@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
lazy_static = "1.4"
|
||||
log = "0.4.14"
|
||||
thiserror = "1.0"
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.57"
|
||||
3
postgres_ffi/README
Normal file
@@ -0,0 +1,3 @@
|
||||
This module contains utility functions for interacting with PostgreSQL
|
||||
file formats.
|
||||
|
||||
44
postgres_ffi/build.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
extern crate bindgen;
|
||||
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn main() {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=pg_control_ffi.h");
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
// The input header we would like to generate
|
||||
// bindings for.
|
||||
.header("pg_control_ffi.h")
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
|
||||
.whitelist_type("ControlFileData")
|
||||
.whitelist_type("CheckPoint")
|
||||
.whitelist_type("FullTransactionId")
|
||||
.whitelist_var("PG_CONTROL_FILE_SIZE")
|
||||
.whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
|
||||
.whitelist_type("DBState")
|
||||
// Path the server include dir. It is in tmp_install/include/server, if you did
|
||||
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
|
||||
// and used DESTDIR to move it into tmp_install, then it's in
|
||||
// tmp_install/include/postgres/server
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
.clang_arg("-I../tmp_install/include/server")
|
||||
.clang_arg("-I../tmp_install/include/postgresql/server")
|
||||
// Finish the builder and generate the bindings.
|
||||
.generate()
|
||||
// Unwrap the Result and panic on failure.
|
||||
.expect("Unable to generate bindings");
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
|
||||
bindings
|
||||
.write_to_file(out_path.join("bindings.rs"))
|
||||
.expect("Couldn't write bindings!");
|
||||
}
|
||||
4
postgres_ffi/pg_control_ffi.h
Normal file
@@ -0,0 +1,4 @@
|
||||
#include "c.h"
|
||||
#include "catalog/pg_control.h"
|
||||
|
||||
const uint32 PG_CONTROLFILEDATA_OFFSETOF_CRC = offsetof(ControlFileData, crc);
|
||||
112
postgres_ffi/src/lib.rs
Normal file
@@ -0,0 +1,112 @@
|
||||
#![allow(non_upper_case_globals)]
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod xlog_utils;
|
||||
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
|
||||
// sizeof(ControlFileData)
|
||||
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
|
||||
const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
|
||||
const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize;
|
||||
|
||||
impl ControlFileData {
|
||||
// Initialize an all-zeros ControlFileData struct
|
||||
pub fn new() -> ControlFileData {
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
let b = [0u8; SIZEOF_CONTROLDATA];
|
||||
controlfile =
|
||||
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
controlfile
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
|
||||
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
|
||||
buf.copy_to_slice(&mut b);
|
||||
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
// TODO: verify CRC
|
||||
let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
|
||||
data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
let expectedcrc = crc32c::crc32c(&data_without_crc);
|
||||
|
||||
controlfile = unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
if expectedcrc != controlfile.crc {
|
||||
anyhow::bail!(
|
||||
"invalid CRC in control file: expected {:08X}, was {:08X}",
|
||||
expectedcrc,
|
||||
controlfile.crc
|
||||
);
|
||||
}
|
||||
|
||||
Ok(controlfile)
|
||||
}
|
||||
|
||||
pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
|
||||
let b: [u8; SIZEOF_CONTROLDATA];
|
||||
|
||||
b = unsafe { std::mem::transmute::<ControlFileData, [u8; SIZEOF_CONTROLDATA]>(controlfile) };
|
||||
|
||||
// Recompute the CRC
|
||||
let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
|
||||
data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
let newcrc = crc32c::crc32c(&data_without_crc);
|
||||
|
||||
let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize);
|
||||
|
||||
buf.extend_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
buf.extend_from_slice(&newcrc.to_ne_bytes());
|
||||
// Fill the rest of the control file with zeros.
|
||||
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
|
||||
|
||||
buf.into()
|
||||
}
|
||||
|
||||
pub fn encode_checkpoint(checkpoint: CheckPoint) -> Bytes {
|
||||
let b: [u8; SIZEOF_CHECKPOINT];
|
||||
b = unsafe { std::mem::transmute::<CheckPoint, [u8; SIZEOF_CHECKPOINT]>(checkpoint) };
|
||||
return Bytes::copy_from_slice(&b[..]);
|
||||
}
|
||||
|
||||
pub fn decode_checkpoint(mut buf: Bytes) -> Result<CheckPoint, anyhow::Error> {
|
||||
let mut b = [0u8; SIZEOF_CHECKPOINT];
|
||||
buf.copy_to_slice(&mut b);
|
||||
let checkpoint: CheckPoint;
|
||||
checkpoint = unsafe { std::mem::transmute::<[u8; SIZEOF_CHECKPOINT], CheckPoint>(b) };
|
||||
Ok(checkpoint)
|
||||
}
|
||||
|
||||
impl CheckPoint {
|
||||
pub fn new(lsn: u64, timeline: u32) -> CheckPoint {
|
||||
CheckPoint {
|
||||
redo: lsn,
|
||||
ThisTimeLineID: timeline,
|
||||
PrevTimeLineID: timeline,
|
||||
fullPageWrites: true, // TODO: get actual value of full_page_writes
|
||||
nextXid: FullTransactionId {
|
||||
value: pg_constants::FIRST_NORMAL_TRANSACTION_ID as u64,
|
||||
}, // TODO: handle epoch?
|
||||
nextOid: pg_constants::FIRST_BOOTSTRAP_OBJECT_ID,
|
||||
nextMulti: 1,
|
||||
nextMultiOffset: 0,
|
||||
oldestXid: pg_constants::FIRST_NORMAL_TRANSACTION_ID,
|
||||
oldestXidDB: 0,
|
||||
oldestMulti: 1,
|
||||
oldestMultiDB: 0,
|
||||
time: 0,
|
||||
oldestCommitTsXid: 0,
|
||||
newestCommitTsXid: 0,
|
||||
oldestActiveXid: pg_constants::INVALID_TRANSACTION_ID,
|
||||
}
|
||||
}
|
||||
}
|
||||
32
postgres_ffi/src/nonrelfile_utils.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL non-relation files.
|
||||
//!
|
||||
use crate::pg_constants;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
|
||||
pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
|
||||
trace!(
|
||||
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
|
||||
status
|
||||
);
|
||||
|
||||
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let bshift: u8 =
|
||||
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
page[byteno] =
|
||||
(page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
|
||||
}
|
||||
|
||||
pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
|
||||
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let bshift: u8 =
|
||||
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
return ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8;
|
||||
}
|
||||
178
postgres_ffi/src/pg_constants.rs
Normal file
@@ -0,0 +1,178 @@
|
||||
//!
|
||||
//! Misc constants, copied from PostgreSQL headers.
|
||||
//!
|
||||
|
||||
//
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
//
|
||||
// Fork numbers, from relpath.h
|
||||
//
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
// Special values for non-rel files' tags (Zenith-specific)
|
||||
//Special values for non-rel files' tags
|
||||
pub const PG_CONTROLFILE_FORKNUM: u8 = 42;
|
||||
pub const PG_FILENODEMAP_FORKNUM: u8 = 43;
|
||||
pub const PG_XACT_FORKNUM: u8 = 44;
|
||||
pub const PG_MXACT_OFFSETS_FORKNUM: u8 = 45;
|
||||
pub const PG_MXACT_MEMBERS_FORKNUM: u8 = 46;
|
||||
pub const PG_TWOPHASE_FORKNUM: u8 = 47;
|
||||
pub const PG_CHECKPOINT_FORKNUM: u8 = 48;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
|
||||
// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
|
||||
// --with-segsize=SEGSIZE, but assume the defaults for now.
|
||||
pub const BLCKSZ: u16 = 8192;
|
||||
pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
|
||||
|
||||
//
|
||||
// constants from clog.h
|
||||
//
|
||||
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
pub const CLOG_XACTS_PER_PAGE: u32 = BLCKSZ as u32 * CLOG_XACTS_PER_BYTE;
|
||||
pub const CLOG_BITS_PER_XACT: u8 = 2;
|
||||
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
|
||||
|
||||
//
|
||||
// Constants from visibilitymap.h
|
||||
//
|
||||
pub const SIZE_OF_PAGE_HEADER: u16 = 24;
|
||||
pub const BITS_PER_HEAPBLOCK: u16 = 2;
|
||||
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
|
||||
|
||||
pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
|
||||
pub const CLOG_ZEROPAGE: u8 = 0x00;
|
||||
pub const CLOG_TRUNCATE: u8 = 0x10;
|
||||
|
||||
// From xact.h
|
||||
pub const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
pub const XLOG_XACT_PREPARE: u8 = 0x10;
|
||||
pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
|
||||
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
|
||||
|
||||
// From slru.h
|
||||
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
pub const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
/*
|
||||
* The following flags, stored in xinfo, determine which information is
|
||||
* contained in commit/abort records.
|
||||
*/
|
||||
pub const XACT_XINFO_HAS_DBINFO: u32 = 1u32 << 0;
|
||||
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
|
||||
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
|
||||
pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
|
||||
pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
|
||||
// pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
|
||||
// pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
|
||||
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_NEXTOID: u8 = 0x30;
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
|
||||
// From multixact.h
|
||||
pub const XLOG_MULTIXACT_ZERO_OFF_PAGE: u8 = 0x00;
|
||||
pub const XLOG_MULTIXACT_ZERO_MEM_PAGE: u8 = 0x10;
|
||||
pub const XLOG_MULTIXACT_CREATE_ID: u8 = 0x20;
|
||||
pub const XLOG_MULTIXACT_TRUNCATE_ID: u8 = 0x30;
|
||||
|
||||
pub const MULTIXACT_OFFSETS_PER_PAGE: u16 = BLCKSZ / 4;
|
||||
pub const MXACT_MEMBER_BITS_PER_XACT: u16 = 8;
|
||||
pub const MXACT_MEMBER_FLAGS_PER_BYTE: u16 = 1;
|
||||
pub const MULTIXACT_FLAGBYTES_PER_GROUP: u16 = 4;
|
||||
pub const MULTIXACT_MEMBERS_PER_MEMBERGROUP: u16 =
|
||||
MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE;
|
||||
/* size in bytes of a complete group */
|
||||
pub const MULTIXACT_MEMBERGROUP_SIZE: u16 =
|
||||
4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP;
|
||||
pub const MULTIXACT_MEMBERGROUPS_PER_PAGE: u16 = BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE;
|
||||
pub const MULTIXACT_MEMBERS_PER_PAGE: u16 =
|
||||
MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP;
|
||||
|
||||
// From heapam_xlog.h
|
||||
pub const XLOG_HEAP_INSERT: u8 = 0x00;
|
||||
pub const XLOG_HEAP_DELETE: u8 = 0x10;
|
||||
pub const XLOG_HEAP_UPDATE: u8 = 0x20;
|
||||
pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
|
||||
pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
|
||||
pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
|
||||
pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
|
||||
pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
|
||||
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
|
||||
pub const RM_XLOG_ID: u8 = 0;
|
||||
pub const RM_XACT_ID: u8 = 1;
|
||||
pub const RM_SMGR_ID: u8 = 2;
|
||||
pub const RM_CLOG_ID: u8 = 3;
|
||||
pub const RM_DBASE_ID: u8 = 4;
|
||||
pub const RM_TBLSPC_ID: u8 = 5;
|
||||
pub const RM_MULTIXACT_ID: u8 = 6;
|
||||
pub const RM_RELMAP_ID: u8 = 7;
|
||||
pub const RM_STANDBY_ID: u8 = 8;
|
||||
pub const RM_HEAP2_ID: u8 = 9;
|
||||
pub const RM_HEAP_ID: u8 = 10;
|
||||
|
||||
// from xlogreader.h
|
||||
pub const XLR_INFO_MASK: u8 = 0x0F;
|
||||
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
|
||||
|
||||
// from dbcommands_xlog.h
|
||||
pub const XLOG_DBASE_CREATE: u8 = 0x00;
|
||||
pub const XLOG_DBASE_DROP: u8 = 0x10;
|
||||
|
||||
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
|
||||
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
|
||||
|
||||
pub const SIZEOF_XLOGRECORD: u32 = 24;
|
||||
|
||||
//
|
||||
// from xlogrecord.h
|
||||
//
|
||||
pub const XLR_MAX_BLOCK_ID: u8 = 32;
|
||||
|
||||
pub const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
|
||||
pub const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
|
||||
pub const XLR_BLOCK_ID_ORIGIN: u8 = 253;
|
||||
pub const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
|
||||
|
||||
pub const BKPBLOCK_FORK_MASK: u8 = 0x0F;
|
||||
pub const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
|
||||
pub const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
|
||||
pub const BKPBLOCK_HAS_DATA: u8 = 0x20;
|
||||
pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
|
||||
pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
|
||||
|
||||
/* Information stored in bimg_info */
|
||||
pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
|
||||
/* From transam.h */
|
||||
pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
|
||||
pub const INVALID_TRANSACTION_ID: u32 = 0;
|
||||
pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
|
||||
pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
|
||||
|
||||
/* FIXME: pageserver should request wal_seg_size from compute node */
|
||||
pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
|
||||
151
postgres_ffi/src/relfile_utils.rs
Normal file
@@ -0,0 +1,151 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL relation files.
|
||||
//!
|
||||
use crate::pg_constants;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
|
||||
pub enum FilePathError {
|
||||
#[error("invalid relation fork name")]
|
||||
InvalidForkName,
|
||||
#[error("invalid relation data file name")]
|
||||
InvalidFileName,
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(_e: core::num::ParseIntError) -> Self {
|
||||
FilePathError::InvalidFileName
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres relation file's fork suffix to fork number.
|
||||
pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(pg_constants::MAIN_FORKNUM),
|
||||
Some("fsm") => Ok(pg_constants::FSM_FORKNUM),
|
||||
Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM),
|
||||
Some("init") => Ok(pg_constants::INIT_FORKNUM),
|
||||
Some(_) => Err(FilePathError::InvalidForkName),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres fork number to the right suffix of the relation data file.
|
||||
pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
match forknum {
|
||||
pg_constants::MAIN_FORKNUM => None,
|
||||
pg_constants::FSM_FORKNUM => Some("fsm"),
|
||||
pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"),
|
||||
pg_constants::INIT_FORKNUM => Some("init"),
|
||||
|
||||
// These should not appear in WAL records, but we use them internally,
|
||||
// and need to be prepared to print them out in log messages and such
|
||||
pg_constants::PG_CONTROLFILE_FORKNUM => Some("controlfile"),
|
||||
pg_constants::PG_FILENODEMAP_FORKNUM => Some("filenodemap"),
|
||||
pg_constants::PG_XACT_FORKNUM => Some("xact"),
|
||||
pg_constants::PG_MXACT_OFFSETS_FORKNUM => Some("mxact_offsets"),
|
||||
pg_constants::PG_MXACT_MEMBERS_FORKNUM => Some("mxact_members"),
|
||||
pg_constants::PG_TWOPHASE_FORKNUM => Some("twophase"),
|
||||
|
||||
_ => Some("UNKNOWN FORKNUM"),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
||||
///
|
||||
/// Formats:
|
||||
/// <oid>
|
||||
/// <oid>_<fork name>
|
||||
/// <oid>.<segment number>
|
||||
/// <oid>_<fork name>.<segment number>
|
||||
///
|
||||
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
||||
///
|
||||
pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
lazy_static! {
|
||||
static ref RELFILE_RE: Regex =
|
||||
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
}
|
||||
let caps = RELFILE_RE
|
||||
.captures(fname)
|
||||
.ok_or(FilePathError::InvalidFileName)?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = relnode_str.parse::<u32>()?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = forkname_to_number(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
segno_match.unwrap().as_str().parse::<u32>()?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_valid_relfilenames() {
|
||||
assert_eq!(parse_relfilename("1234"), Ok((1234, 0, 0)));
|
||||
assert_eq!(parse_relfilename("1234_fsm"), Ok((1234, 1, 0)));
|
||||
assert_eq!(parse_relfilename("1234_vm"), Ok((1234, 2, 0)));
|
||||
assert_eq!(parse_relfilename("1234_init"), Ok((1234, 3, 0)));
|
||||
|
||||
assert_eq!(parse_relfilename("1234.12"), Ok((1234, 0, 12)));
|
||||
assert_eq!(parse_relfilename("1234_fsm.12"), Ok((1234, 1, 12)));
|
||||
assert_eq!(parse_relfilename("1234_vm.12"), Ok((1234, 2, 12)));
|
||||
assert_eq!(parse_relfilename("1234_init.12"), Ok((1234, 3, 12)));
|
||||
|
||||
// relfilenode is unsigned, so it can go up to 2^32-1
|
||||
assert_eq!(parse_relfilename("3147483648"), Ok((3147483648, 0, 0)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_invalid_relfilenames() {
|
||||
assert_eq!(
|
||||
parse_relfilename("foo"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1.2.3"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1234_invalid"),
|
||||
Err(FilePathError::InvalidForkName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1234_"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
|
||||
// too large for u32
|
||||
assert_eq!(
|
||||
parse_relfilename("12345678901"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("-1234"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_weird_relfilenames() {
|
||||
// we accept 0 for the relfilenode, but PostgreSQL should never do that.
|
||||
assert_eq!(parse_relfilename("0"), Ok((0, 0, 0)));
|
||||
|
||||
// PostgreSQL has a limit of 2^32-2 blocks in a table. With 8k block size and
|
||||
// 1 GB segments, the max segment number is 32767. But we accept larger values
|
||||
// currently.
|
||||
assert_eq!(parse_relfilename("1.123456"), Ok((1, 0, 123456)));
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,21 @@
|
||||
//
|
||||
// This file contains common utilities for dealing with PostgreSQL WAL files and
|
||||
// LSNs.
|
||||
//
|
||||
// Many of these functions have been copied from PostgreSQL, and rewritten in
|
||||
// Rust. That's why they don't follow the usual Rust naming conventions, they
|
||||
// have been named the same as the corresponding PostgreSQL functions instead.
|
||||
//
|
||||
|
||||
use crate::pg_constants;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, Bytes};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
@@ -16,24 +27,16 @@ pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
|
||||
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
|
||||
pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = u64;
|
||||
pub type XLogSegNo = u64;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
|
||||
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
return xlogptr / wal_segsz_bytes as u64;
|
||||
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -42,7 +45,7 @@ pub fn XLogSegNoOffsetToRecPtr(
|
||||
offset: u32,
|
||||
wal_segsz_bytes: usize,
|
||||
) -> XLogRecPtr {
|
||||
return segno * (wal_segsz_bytes as u64) + (offset as u64);
|
||||
segno * (wal_segsz_bytes as u64) + (offset as u64)
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -60,7 +63,7 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
|
||||
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
|
||||
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -70,7 +73,7 @@ pub fn IsXLogFileName(fname: &str) -> bool {
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsPartialXLogFileName(fname: &str) -> bool {
|
||||
return fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]);
|
||||
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
|
||||
}
|
||||
|
||||
pub fn get_current_timestamp() -> TimestampTz {
|
||||
@@ -89,7 +92,7 @@ pub fn get_current_timestamp() -> TimestampTz {
|
||||
}
|
||||
|
||||
fn find_end_of_wal_segment(
|
||||
data_dir: &PathBuf,
|
||||
data_dir: &Path,
|
||||
segno: XLogSegNo,
|
||||
tli: TimeLineID,
|
||||
wal_seg_size: usize,
|
||||
@@ -181,11 +184,14 @@ fn find_end_of_wal_segment(
|
||||
}
|
||||
}
|
||||
}
|
||||
return last_valid_rec_pos as u32;
|
||||
last_valid_rec_pos as u32
|
||||
}
|
||||
|
||||
///
|
||||
/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL.
|
||||
///
|
||||
pub fn find_end_of_wal(
|
||||
data_dir: &PathBuf,
|
||||
data_dir: &Path,
|
||||
wal_seg_size: usize,
|
||||
precise: bool,
|
||||
) -> (XLogRecPtr, TimeLineID) {
|
||||
@@ -237,7 +243,7 @@ pub fn find_end_of_wal(
|
||||
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
|
||||
return (high_ptr, high_tli);
|
||||
}
|
||||
return (0, 0);
|
||||
(0, 0)
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
@@ -252,3 +258,39 @@ pub fn main() {
|
||||
tli
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// Xlog record parsing routines
|
||||
// TODO move here other related code from waldecoder.rs
|
||||
//
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogRecord {
|
||||
pub xl_tot_len: u32,
|
||||
pub xl_xid: u32,
|
||||
pub xl_prev: u64,
|
||||
pub xl_info: u8,
|
||||
pub xl_rmid: u8,
|
||||
pub xl_crc: u32,
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
|
||||
XLogRecord {
|
||||
xl_tot_len: buf.get_u32_le(),
|
||||
xl_xid: buf.get_u32_le(),
|
||||
xl_prev: buf.get_u64_le(),
|
||||
xl_info: buf.get_u8(),
|
||||
xl_rmid: buf.get_u8(),
|
||||
xl_crc: {
|
||||
buf.advance(2);
|
||||
buf.get_u32_le()
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Is this record an XLOG_SWITCH record? They need some special processing,
|
||||
pub fn is_xlog_switch_record(&self) -> bool {
|
||||
self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
}
|
||||
}
|
||||
92
test_runner/README.md
Normal file
@@ -0,0 +1,92 @@
|
||||
## Zenith test runner
|
||||
|
||||
This directory contains integration tests.
|
||||
|
||||
Prerequisites:
|
||||
- Python 3.6 or later
|
||||
- Python packages: pytest, psycopg2
|
||||
- pytest 6.0 is required.
|
||||
- __NOTE: `apt install` on Debian/Ubuntu won't work.__
|
||||
They ship a much older version of pytest (and sometimes rename it to
|
||||
`pytest-3`.)
|
||||
- Install using something like this:
|
||||
- `pip3 install pytest psycopg2` (Debian or Ubuntu)
|
||||
- Zenith and Postgres binaries
|
||||
- See the root README.md for build directions
|
||||
- Tests can be run from the git tree; or see the environment variables
|
||||
below to run from other directories.
|
||||
- The zenith git repo, including the postgres submodule
|
||||
(for some tests, e.g. pg_regress)
|
||||
|
||||
### Test Organization
|
||||
|
||||
The tests are divided into a few batches, such that each batch takes roughly
|
||||
the same amount of time. The batches can be run in parallel, to minimize total
|
||||
runtime. Currently, there are only two batches:
|
||||
|
||||
- test_batch_pg_regress: Runs PostgreSQL regression tests
|
||||
- test_others: All other tests
|
||||
|
||||
### Running the tests
|
||||
|
||||
Because pytest will search all subdirectories for tests, it's easiest to
|
||||
run the tests from within the `test_runner` directory.
|
||||
|
||||
Test state (postgres data, pageserver state, and log files) will
|
||||
be stored under a directory `test_output`.
|
||||
|
||||
You can run all the tests with:
|
||||
|
||||
`pytest`
|
||||
|
||||
If you want to run all the tests in a particular file:
|
||||
|
||||
`pytest test_pgbench.py`
|
||||
|
||||
If you want to run all tests that have the string "bench" in their names:
|
||||
|
||||
`pytest -k bench`
|
||||
|
||||
Useful environment variables:
|
||||
|
||||
`ZENITH_BIN`: The directory where zenith binaries can be found.
|
||||
`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
|
||||
`TEST_OUTPUT`: Set the directory where test state and test output files
|
||||
should go.
|
||||
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
|
||||
|
||||
Let stdout and stderr go to the terminal instead of capturing them:
|
||||
`pytest -s ...`
|
||||
(Note many tests capture subprocess outputs separately, so this may not
|
||||
show much.)
|
||||
|
||||
Exit after the first test failure:
|
||||
`pytest -x ...`
|
||||
(there are many more pytest options; run `pytest -h` to see them.)
|
||||
|
||||
|
||||
### Building new tests
|
||||
|
||||
The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
|
||||
|
||||
Essentially, this means that each time you see a fixture named as an input parameter, the function with that name will be run and its result passed as a parameter to the test function.
|
||||
|
||||
So this code:
|
||||
```
|
||||
def test_something(zenith_cli, pg_bin):
|
||||
pass
|
||||
```
|
||||
|
||||
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
|
||||
|
||||
Fixtures can't be imported using the normal python syntax. Instead, use this:
|
||||
```
|
||||
pytest_plugins = ("fixtures.something")
|
||||
```
|
||||
That will make all the fixtures in the `fixtures/something.py` file available.
|
||||
|
||||
Anything that's likely to be used in multiple tests should be built into a fixture.
|
||||
|
||||
Note that fixtures can clean up after themselves if they use the `yield` syntax.
|
||||
Cleanup will happen even if the test fails (raises an unhandled exception).
|
||||
Python destructors, e.g. `__del__()` aren't recommended for cleanup.
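
For example, a fixture that creates a temporary directory and cleans it up afterwards might look roughly like this (the `scratch_dir` fixture below is a made-up illustration, not one of the real fixtures in this repository):
```
import shutil
import tempfile

import pytest


@pytest.fixture
def scratch_dir():
    # Everything before the yield is setup.
    path = tempfile.mkdtemp(prefix='zenith_test_')

    # The yielded value is what a test receives when it lists
    # 'scratch_dir' as an input parameter.
    yield path

    # Everything after the yield is cleanup; it runs even if the test failed.
    shutil.rmtree(path, ignore_errors=True)
```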
|
||||
67
test_runner/batch_others/test_branch_behind.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import pytest
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
# Create a couple of branches off the main branch, at a historical point in time.
|
||||
#
|
||||
def test_branch_behind(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Create a branch for this test, starting from the 'empty' branch
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"]);
|
||||
|
||||
pgmain = postgres.create_start('test_branch_behind')
|
||||
print("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
main_pg_conn = psycopg2.connect(pgmain.connstr());
|
||||
main_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
main_cur.execute('CREATE TABLE foo (t text)');
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@"+lsn_a]);
|
||||
|
||||
# Insert many more rows. This generates enough WAL to fill a few segments.
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
|
||||
# Branch at the point where 100100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@"+lsn_b]);
|
||||
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
pg_more = postgres.create_start("test_branch_behind_more")
|
||||
|
||||
# On the 'hundred' branch, we should see only 100 rows
|
||||
hundred_pg_conn = psycopg2.connect(pg_hundred.connstr())
|
||||
hundred_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
hundred_cur = hundred_pg_conn.cursor()
|
||||
hundred_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(hundred_cur.fetchone()[0] == 100);
|
||||
|
||||
# On the 'more' branch, we should see 100100 rows
|
||||
more_pg_conn = psycopg2.connect(pg_more.connstr())
|
||||
more_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(more_cur.fetchone()[0] == 100100);
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(main_cur.fetchone()[0] == 200100);
|
||||
30
test_runner/batch_others/test_config.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import pytest
|
||||
import os
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test starting Postgres with custom options
|
||||
#
|
||||
def test_config(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_config", "empty"]);
|
||||
|
||||
# change config
|
||||
pg = postgres.create_start('test_config', ['log_min_messages=debug1'])
|
||||
print('postgres is running on test_config branch')
|
||||
|
||||
pg_conn = psycopg2.connect(pg.connstr())
|
||||
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
#check that config change was applied
|
||||
cur.execute('SELECT name, setting from pg_settings WHERE source!=%s and source!=%s', ("default","override",))
|
||||
for record in cur:
|
||||
if record[0] == 'log_min_messages':
|
||||
assert(record[1] == 'debug1')
|
||||
|
||||
pg_conn.close()
|
||||
37
test_runner/batch_others/test_createdb.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import pytest
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
# Test CREATE DATABASE when there have been relmapper changes
|
||||
#
|
||||
def test_createdb(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"]);
|
||||
|
||||
pg = postgres.create_start('test_createdb')
|
||||
print("postgres is running on 'test_createdb' branch")
|
||||
|
||||
conn = psycopg2.connect(pg.connstr());
|
||||
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Cause a 'relmapper' change in the original branch
|
||||
cur.execute('VACUUM FULL pg_class');
|
||||
|
||||
cur.execute('CREATE DATABASE foodb');
|
||||
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
conn.close();
|
||||
|
||||
# Create a branch
|
||||
zenith_cli.run(["branch", "test_createdb2", "test_createdb@"+lsn]);
|
||||
|
||||
pg2 = postgres.create_start('test_createdb2')
|
||||
|
||||
# Test that you can connect to the new database on both branches
|
||||
conn = psycopg2.connect(pg.connstr('foodb'));
|
||||
conn2 = psycopg2.connect(pg2.connstr('foodb'));
|
||||
54
test_runner/batch_others/test_pageserver_api.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import pytest
|
||||
import psycopg2
|
||||
import getpass
|
||||
import json
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def test_status(pageserver):
|
||||
pg_conn = psycopg2.connect(pageserver.connstr())
|
||||
pg_conn.autocommit = True
|
||||
cur = pg_conn.cursor()
|
||||
cur.execute('status;')
|
||||
assert cur.fetchone() == ('hello world',)
|
||||
pg_conn.close()
|
||||
|
||||
def test_branch_list(pageserver, zenith_cli):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_branch_list_main", "empty"]);
|
||||
|
||||
page_server_conn = psycopg2.connect(pageserver.connstr())
|
||||
page_server_conn.autocommit = True
|
||||
page_server_cur = page_server_conn.cursor()
|
||||
|
||||
page_server_cur.execute('branch_list;')
|
||||
branches = json.loads(page_server_cur.fetchone()[0])
|
||||
# Filter out branches created by other tests
|
||||
branches = [x for x in branches if x['name'].startswith('test_branch_list')]
|
||||
|
||||
assert len(branches) == 1
|
||||
assert branches[0]['name'] == 'test_branch_list_main'
|
||||
assert 'timeline_id' in branches[0]
|
||||
assert 'latest_valid_lsn' in branches[0]
|
||||
assert 'ancestor_id' in branches[0]
|
||||
assert 'ancestor_lsn' in branches[0]
|
||||
|
||||
# Create another branch, and start Postgres on it
|
||||
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
|
||||
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
|
||||
|
||||
page_server_cur.execute('branch_list;')
|
||||
new_branches = json.loads(page_server_cur.fetchone()[0])
|
||||
# Filter out branches created by other tests
|
||||
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
|
||||
assert len(new_branches) == 2
|
||||
new_branches.sort(key=lambda k: k['name'])
|
||||
|
||||
assert new_branches[0]['name'] == 'test_branch_list_experimental'
|
||||
assert new_branches[0]['timeline_id'] != branches[0]['timeline_id']
|
||||
|
||||
# TODO: do the LSNs have to match here?
|
||||
assert new_branches[1] == branches[0]
|
||||
|
||||
page_server_conn.close()
|
||||
17
test_runner/batch_others/test_pgbench.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import pytest
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def test_pgbench(pageserver, postgres, pg_bin, zenith_cli):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_pgbench", "empty"]);
|
||||
|
||||
pg = postgres.create_start('test_pgbench')
|
||||
print("postgres is running on 'test_pgbench' branch")
|
||||
|
||||
connstr = pg.connstr();
|
||||
|
||||
pg_bin.run_capture(['pgbench', '-i', connstr])
|
||||
pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr])
|
||||
58
test_runner/batch_others/test_twophase.py
Normal file
@@ -0,0 +1,58 @@
|
||||
#
|
||||
# Test branching, when a transaction is in prepared state
|
||||
#
|
||||
import pytest
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def test_twophase(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_twophase", "empty"]);
|
||||
|
||||
pg = postgres.create_start('test_twophase', ['max_prepared_transactions=5'])
|
||||
print("postgres is running on 'test_twophase' branch")
|
||||
|
||||
conn = psycopg2.connect(pg.connstr());
|
||||
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute('CREATE TABLE foo (t text)');
|
||||
|
||||
# Prepare a transaction that will insert a row
|
||||
cur.execute('BEGIN');
|
||||
cur.execute("INSERT INTO foo VALUES ('one')");
|
||||
cur.execute("PREPARE TRANSACTION 'insert_one'");
|
||||
|
||||
# Prepare another transaction that will insert a row
|
||||
cur.execute('BEGIN');
|
||||
cur.execute("INSERT INTO foo VALUES ('two')");
|
||||
cur.execute("PREPARE TRANSACTION 'insert_two'");
|
||||
|
||||
cur.execute('BEGIN');
|
||||
cur.execute("INSERT INTO foo VALUES ('three')");
|
||||
cur.execute("PREPARE TRANSACTION 'insert_three'");
|
||||
cur.execute("COMMIT PREPARED 'insert_three'");
|
||||
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
# Create a branch with the transaction in prepared state
|
||||
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase@"+lsn]);
|
||||
|
||||
pg2 = postgres.create_start('test_twophase_prepared', ['max_prepared_transactions=5'])
|
||||
conn2 = psycopg2.connect(pg2.connstr());
|
||||
conn2.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur2 = conn2.cursor()
|
||||
|
||||
# On the new branch, commit one of the prepared transactions, abort the other one.
|
||||
cur2.execute("COMMIT PREPARED 'insert_one'");
|
||||
cur2.execute("ROLLBACK PREPARED 'insert_two'");
|
||||
|
||||
cur2.execute('SELECT * FROM foo');
|
||||
assert(cur2.fetchall() == [('one',),('three',)]);
|
||||
|
||||
# Neither insert is visible on the original branch, the transactions are still
|
||||
# in prepared state there.
|
||||
cur.execute('SELECT * FROM foo');
|
||||
assert(cur.fetchall() == [('three',)]);
|
||||
49
test_runner/batch_others/test_zenith_cli.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import pytest
|
||||
import psycopg2
|
||||
import json
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def helper_compare_branch_list(page_server_cur, zenith_cli):
|
||||
"""
|
||||
Compare branches list returned by CLI and directly via API.
|
||||
Filters out branches created by other tests.
|
||||
"""
|
||||
|
||||
page_server_cur.execute('branch_list;')
|
||||
branches_api = sorted(map(lambda b: b['name'], json.loads(page_server_cur.fetchone()[0])))
|
||||
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]
|
||||
|
||||
res = zenith_cli.run(["branch"]);
|
||||
assert(res.stderr == '')
|
||||
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
|
||||
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
|
||||
|
||||
assert(branches_api == branches_cli)
|
||||
|
||||
def test_cli_branch_list(pageserver, zenith_cli):
|
||||
|
||||
page_server_conn = psycopg2.connect(pageserver.connstr())
|
||||
page_server_conn.autocommit = True
|
||||
page_server_cur = page_server_conn.cursor()
|
||||
|
||||
# Initial sanity check
|
||||
helper_compare_branch_list(page_server_cur, zenith_cli)
|
||||
|
||||
# Create a branch for us
|
||||
res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"]);
|
||||
assert(res.stderr == '')
|
||||
helper_compare_branch_list(page_server_cur, zenith_cli)
|
||||
|
||||
# Create a nested branch
|
||||
res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"]);
|
||||
assert(res.stderr == '')
|
||||
helper_compare_branch_list(page_server_cur, zenith_cli)
|
||||
|
||||
# Check that all new branches are visible via CLI
|
||||
res = zenith_cli.run(["branch"]);
|
||||
assert(res.stderr == '')
|
||||
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
|
||||
|
||||
assert('test_cli_branch_list_main' in branches_cli)
|
||||
assert('test_cli_branch_list_nested' in branches_cli)
|
||||
61
test_runner/batch_pg_regress/test_pg_regress.py
Normal file
@@ -0,0 +1,61 @@
import pytest
from fixtures.utils import mkdir_if_needed
import getpass
import os
import psycopg2

pytest_plugins = ("fixtures.zenith_fixtures")

# FIXME: put host + port in a fixture
HOST = 'localhost'
PORT = 55432


def test_pg_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):

    # Create a branch for us
    zenith_cli.run(["branch", "test_pg_regress", "empty"]);

    # Connect to postgres and create a database called "regression".
    pg = postgres.create_start('test_pg_regress')
    pg_conn = psycopg2.connect(pg.connstr())
    pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
    cur = pg_conn.cursor()
    cur.execute('CREATE DATABASE regression')
    pg_conn.close()

    # Create some local directories for pg_regress to run in.
    runpath = os.path.join(test_output_dir, 'regress')
    mkdir_if_needed(runpath)
    mkdir_if_needed(os.path.join(runpath, 'testtablespace'))

    # Compute all the file locations that pg_regress will need.
    build_path = os.path.join(
        pg_distrib_dir, 'build/src/test/regress')
    src_path = os.path.join(
        base_dir, 'vendor/postgres/src/test/regress')
    bindir = os.path.join(pg_distrib_dir, 'bin')
    schedule = os.path.join(src_path, 'parallel_schedule')
    pg_regress = os.path.join(build_path, 'pg_regress')

    pg_regress_command = [
        pg_regress,
        '--use-existing',
        '--bindir={}'.format(bindir),
        '--dlpath={}'.format(build_path),
        '--schedule={}'.format(schedule),
        '--inputdir={}'.format(src_path),
    ]

    env = {
        'PGPORT': str(pg.port),
        'PGUSER': pg.username,
        'PGHOST': pg.host,
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env, cwd=runpath)
test_runner/batch_pg_regress/test_zehith_regress.py (new file, 62 lines)
@@ -0,0 +1,62 @@
import pytest
from fixtures.utils import mkdir_if_needed
import getpass
import os
import psycopg2

pytest_plugins = ("fixtures.zenith_fixtures")

# FIXME: put host + port in a fixture
HOST = 'localhost'
PORT = 55432


def test_zenith_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):

    # Create a branch for us
    zenith_cli.run(["branch", "test_zenith_regress", "empty"]);

    # Connect to postgres and create a database called "regression".
    pg = postgres.create_start('test_zenith_regress')
    pg_conn = psycopg2.connect(pg.connstr())
    pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
    cur = pg_conn.cursor()
    cur.execute('CREATE DATABASE regression')
    pg_conn.close()

    # Create some local directories for pg_regress to run in.
    runpath = os.path.join(test_output_dir, 'regress')
    mkdir_if_needed(runpath)
    mkdir_if_needed(os.path.join(runpath, 'testtablespace'))

    # Compute all the file locations that pg_regress will need.
    # This test runs zenith specific tests
    build_path = os.path.join(
        pg_distrib_dir, 'build/src/test/regress')
    src_path = os.path.join(
        base_dir, 'test_runner/zenith_regress')
    bindir = os.path.join(pg_distrib_dir, 'bin')
    schedule = os.path.join(src_path, 'parallel_schedule')
    pg_regress = os.path.join(build_path, 'pg_regress')

    pg_regress_command = [
        pg_regress,
        '--use-existing',
        '--bindir={}'.format(bindir),
        '--dlpath={}'.format(build_path),
        '--schedule={}'.format(schedule),
        '--inputdir={}'.format(src_path),
    ]

    print(pg_regress_command)
    env = {
        'PGPORT': str(pg.port),
        'PGUSER': pg.username,
        'PGHOST': pg.host,
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env, cwd=runpath)
test_runner/conftest.py (new file, 1 line)
@@ -0,0 +1 @@
pytest_plugins = ("fixtures.zenith_fixtures")
test_runner/fixtures/__init__.py (new empty file)
test_runner/fixtures/utils.py (new file, 53 lines)
@@ -0,0 +1,53 @@
import os
import subprocess


def get_self_dir():
    """ Get the path to the directory where this script lives. """
    return os.path.dirname(os.path.abspath(__file__))


def mkdir_if_needed(path):
    """ Create a directory if it doesn't already exist

    Note this won't try to create intermediate directories.
    """
    if os.path.exists(path):
        assert os.path.isdir(path)
        return
    os.mkdir(path)


def subprocess_capture(capture_dir, cmd, **kwargs):
    """ Run a process and capture its output

    Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
    where "cmd" is the name of the program and NNN is an incrementing
    counter.

    If those files already exist, we will overwrite them.
    """
    assert type(cmd) is list
    base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
    basepath = os.path.join(capture_dir, base)
    stdout_filename = basepath + '.stdout'
    stderr_filename = basepath + '.stderr'

    with open(stdout_filename, 'w') as stdout_f:
        with open(stderr_filename, 'w') as stderr_f:
            print('(capturing output to "{}.stdout")'.format(base))
            subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)


_global_counter = 0


def global_counter():
    """ A really dumb global counter.

    This is useful for giving output files a unique number, so if we run the
    same command multiple times we can keep their output separate.
    """
    global _global_counter
    _global_counter += 1
    return _global_counter
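The capture helper above is easiest to see with a short usage sketch. This is illustrative only: the `demo` directory name and the `psql --version` command are assumptions, not something the test suite itself runs.

```python
# Illustrative sketch (not part of the suite): capture a command's output
# using the helpers defined in fixtures/utils.py.
import os
from fixtures.utils import mkdir_if_needed, subprocess_capture

top = os.path.join(os.getcwd(), 'test_output')
capture_dir = os.path.join(top, 'demo')   # hypothetical directory name
mkdir_if_needed(top)                      # mkdir_if_needed is not recursive,
mkdir_if_needed(capture_dir)              # so create each level explicitly

# Each call gets a fresh number from global_counter(), so repeated runs land
# in separate files (e.g. psql_1.stdout/.stderr, psql_2.stdout/.stderr).
subprocess_capture(capture_dir, ['psql', '--version'])
subprocess_capture(capture_dir, ['psql', '--version'])
```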
test_runner/fixtures/zenith_fixtures.py (new file, 357 lines)
@@ -0,0 +1,357 @@
|
||||
import getpass
|
||||
import os
|
||||
import psycopg2
|
||||
import pytest
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from .utils import (get_self_dir, mkdir_if_needed,
|
||||
subprocess_capture, global_counter)
|
||||
|
||||
"""
|
||||
This file contains pytest fixtures. A fixture is a test resource that can be
|
||||
summoned by placing its name in the test's arguments.
|
||||
|
||||
A fixture is created with the decorator @zenfixture, which is a wrapper around
|
||||
the standard pytest.fixture with some extra behavior.
|
||||
|
||||
There are several environment variables that can control the running of tests:
|
||||
ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.
|
||||
|
||||
To use fixtures in a test file, add this line of code:
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
Don't import functions from this file, or pytest will emit warnings. Instead
|
||||
put directly-importable functions into utils.py or another separate file.
|
||||
"""
|
||||
|
||||
DEFAULT_OUTPUT_DIR = 'test_output'
|
||||
DEFAULT_POSTGRES_DIR = 'tmp_install'
|
||||
|
||||
|
||||
def determine_scope(fixture_name, config):
|
||||
return 'session'
|
||||
|
||||
|
||||
def zenfixture(func):
    """ A decorator for fixtures with a flexible scope.

    By default every test function will set up and tear down a new
    database. In pytest terms, that is "function" scope.

    If the environment variable TEST_SHARED_FIXTURES is set, then all
    tests will share the same database. State, logs, etc. will be
    stored in a directory called "shared".
    """
    if os.environ.get('TEST_SHARED_FIXTURES') is None:
        scope = 'function'
    else:
        scope = 'session'
    return pytest.fixture(func, scope=scope)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True, scope='session')
|
||||
def safety_check():
|
||||
""" Ensure that no unwanted daemons are running before we start testing. """
|
||||
# does not use -c as it is not supported on macOS
|
||||
cmd = ['pgrep', 'pageserver|postgres|wal_acceptor']
|
||||
result = subprocess.run(cmd, stdout=subprocess.DEVNULL)
|
||||
if result.returncode == 0:
|
||||
# returncode of 0 means it found something.
|
||||
# This is bad; we don't want any of those processes polluting the
|
||||
# result of the test.
|
||||
raise Exception('found interfering processes running')
|
||||
|
||||
|
||||
class ZenithCli:
|
||||
""" An object representing the CLI binary named "zenith".
|
||||
|
||||
We also store an environment that will tell the CLI to operate
|
||||
on a particular ZENITH_REPO_DIR.
|
||||
"""
|
||||
|
||||
def __init__(self, binpath, repo_dir, pg_distrib_dir):
|
||||
assert os.path.isdir(binpath)
|
||||
self.binpath = binpath
|
||||
self.bin_zenith = os.path.join(binpath, 'zenith')
|
||||
self.env = os.environ.copy()
|
||||
self.env['ZENITH_REPO_DIR'] = repo_dir
|
||||
self.env['POSTGRES_DISTRIB_DIR'] = pg_distrib_dir
|
||||
|
||||
def run(self, arguments):
|
||||
""" Run "zenith" with the specified arguments.
|
||||
|
||||
arguments must be in list form, e.g. ['pg', 'create']
|
||||
|
||||
Return both stdout and stderr, which can be accessed as
|
||||
|
||||
result = zenith_cli.run(...)
|
||||
assert(result.stderr == "")
|
||||
print(result.stdout)
|
||||
|
||||
"""
|
||||
assert type(arguments) == list
|
||||
args = [self.bin_zenith] + arguments
|
||||
print('Running command "{}"'.format(' '.join(args)))
|
||||
return subprocess.run(args, env=self.env, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
|
||||
|
||||
@zenfixture
|
||||
def zenith_cli(zenith_binpath, repo_dir, pg_distrib_dir):
|
||||
return ZenithCli(zenith_binpath, repo_dir, pg_distrib_dir)
|
||||
|
||||
|
||||
class ZenithPageserver:
|
||||
""" An object representing a running pageserver. """
|
||||
|
||||
def __init__(self, zenith_cli):
|
||||
self.zenith_cli = zenith_cli
|
||||
self.running = False
|
||||
|
||||
# Initialize the repository, i.e. run "zenith init"
|
||||
def init(self):
|
||||
self.zenith_cli.run(['init'])
|
||||
|
||||
# Start the page server
|
||||
def start(self):
|
||||
self.zenith_cli.run(['start'])
|
||||
self.running = True
|
||||
|
||||
    # Stop the page server
    def stop(self):
        self.zenith_cli.run(['stop'])
        self.running = False
|
||||
|
||||
# The page server speaks the Postgres FE/BE protocol, so you can connect
|
||||
# to it with any Postgres client, and run special commands. This function
|
||||
# returns a libpq connection string for connecting to it.
|
||||
def connstr(self):
|
||||
username = getpass.getuser()
|
||||
conn_str = 'host={} port={} dbname=postgres user={}'.format(
|
||||
'localhost', 64000, username)
|
||||
return conn_str
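Because the pageserver speaks the normal Postgres wire protocol, a test can query it directly through this connection string. A rough sketch, mirroring what test_cli.py does with the 'branch_list;' command (the JSON shape with a 'name' field is taken from that test; `pageserver` here is the fixture defined below):

```python
# Sketch only: list branches by talking to the pageserver over libpq.
import json
import psycopg2

conn = psycopg2.connect(pageserver.connstr())
conn.autocommit = True
cur = conn.cursor()
cur.execute('branch_list;')
branches = json.loads(cur.fetchone()[0])
print(sorted(b['name'] for b in branches))
conn.close()
```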
|
||||
|
||||
# The 'pageserver' fixture provides a Page Server that's up and running.
|
||||
#
|
||||
# If TEST_SHARED_FIXTURES is set, the Page Server instance is shared by all
|
||||
# the tests. To avoid clashing with other tests, don't use the 'main' branch in
|
||||
# the tests directly. Instead, create a branch off the 'empty' branch and use
|
||||
# that.
|
||||
#
|
||||
# By convention, the test branches are named after the tests. For example, a
# test called 'test_foo' would create and use branches with the 'test_foo' prefix.
|
||||
@zenfixture
|
||||
def pageserver(zenith_cli):
|
||||
ps = ZenithPageserver(zenith_cli)
|
||||
ps.init()
|
||||
ps.start()
|
||||
# For convenience in tests, create a branch from the freshly-initialized cluster.
|
||||
zenith_cli.run(["branch", "empty", "main"]);
|
||||
yield ps
|
||||
# After the yield comes any cleanup code we need.
|
||||
print('Starting pageserver cleanup')
|
||||
ps.stop()
|
||||
|
||||
class Postgres:
|
||||
""" An object representing a running postgres daemon. """
|
||||
|
||||
def __init__(self, zenith_cli, repo_dir, instance_num):
|
||||
self.zenith_cli = zenith_cli
|
||||
self.instance_num = instance_num
|
||||
self.running = False
|
||||
self.username = getpass.getuser()
|
||||
self.host = 'localhost'
|
||||
self.port = 55431 + instance_num
|
||||
self.repo_dir = repo_dir
|
||||
self.branch = None
|
||||
# path to conf is <repo_dir>/pgdatadirs/<branch_name>/postgresql.conf
|
||||
|
||||
def create_start(self, branch, config_lines=None):
|
||||
""" create the pg data directory, and start the server """
|
||||
self.zenith_cli.run(['pg', 'create', branch])
|
||||
self.branch = branch
|
||||
if config_lines is None:
|
||||
config_lines = []
|
||||
self.config(config_lines)
|
||||
self.zenith_cli.run(['pg', 'start', branch])
|
||||
self.running = True
|
||||
return
|
||||
|
||||
#lines should be an array of valid postgresql.conf rows
|
||||
def config(self, lines):
|
||||
filename = 'pgdatadirs/{}/postgresql.conf'.format(self.branch)
|
||||
config_name = os.path.join(self.repo_dir, filename)
|
||||
with open(config_name, 'a') as conf:
|
||||
for line in lines:
|
||||
conf.write(line)
|
||||
conf.write('\n')
|
||||
|
||||
def stop(self):
|
||||
if self.running:
|
||||
self.zenith_cli.run(['pg', 'stop', self.branch])
|
||||
|
||||
# Return a libpq connection string to connect to the Postgres instance
|
||||
def connstr(self, dbname='postgres'):
|
||||
conn_str = 'host={} port={} dbname={} user={}'.format(
|
||||
self.host, self.port, dbname, self.username)
|
||||
return conn_str
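For reference, a sketch of how a test might combine create_start() with extra postgresql.conf lines via config_lines. The branch name and the log_statement setting are illustrative assumptions, not taken from the suite; `postgres` and `zenith_cli` are the fixtures defined in this file.

```python
# Hypothetical usage of the Postgres / PostgresFactory API above.
zenith_cli.run(["branch", "test_custom_config", "empty"])
pg = postgres.create_start('test_custom_config',
                           config_lines=['log_statement = all'])
conn = psycopg2.connect(pg.connstr())
```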
|
||||
|
||||
class PostgresFactory:
|
||||
""" An object representing multiple running postgres daemons. """
|
||||
def __init__(self, zenith_cli, repo_dir):
|
||||
self.zenith_cli = zenith_cli
|
||||
self.host = 'localhost'
|
||||
self.repo_dir = repo_dir
|
||||
self.num_instances = 0
|
||||
self.instances = []
|
||||
|
||||
def create_start(self, branch="main", config_lines=None):
|
||||
pg = Postgres(self.zenith_cli, self.repo_dir, self.num_instances + 1)
|
||||
self.num_instances += 1
|
||||
self.instances.append(pg)
|
||||
pg.create_start(branch, config_lines)
|
||||
return pg
|
||||
|
||||
def stop_all(self):
|
||||
for pg in self.instances:
|
||||
pg.stop()
|
||||
|
||||
@zenfixture
|
||||
def postgres(zenith_cli, repo_dir):
|
||||
pgfactory = PostgresFactory(zenith_cli, repo_dir)
|
||||
yield pgfactory
|
||||
# After the yield comes any cleanup code we need.
|
||||
print('Starting postgres cleanup')
|
||||
pgfactory.stop_all()
|
||||
|
||||
|
||||
class PgBin:
|
||||
""" A helper class for executing postgres binaries """
|
||||
|
||||
def __init__(self, log_dir, pg_distrib_dir):
|
||||
self.log_dir = log_dir
|
||||
self.pg_install_path = pg_distrib_dir
|
||||
self.pg_bin_path = os.path.join(self.pg_install_path, 'bin')
|
||||
self.env = os.environ.copy()
|
||||
self.env['LD_LIBRARY_PATH'] = os.path.join(self.pg_install_path, 'lib')
|
||||
|
||||
def _fixpath(self, command):
|
||||
if not '/' in command[0]:
|
||||
command[0] = os.path.join(self.pg_bin_path, command[0])
|
||||
|
||||
def _build_env(self, env_add):
|
||||
if env_add is None:
|
||||
return self.env
|
||||
env = self.env.copy()
|
||||
env.update(env_add)
|
||||
return env
|
||||
|
||||
def run(self, command, env=None, cwd=None):
|
||||
""" Run one of the postgres binaries.
|
||||
|
||||
The command should be in list form, e.g. ['pgbench', '-p', '55432']
|
||||
|
||||
All the necessary environment variables will be set.
|
||||
|
||||
If the first argument (the command name) doesn't include a path (no '/'
|
||||
characters present), then it will be edited to include the correct path.
|
||||
|
||||
If you want stdout/stderr captured to files, use `run_capture` instead.
|
||||
|
||||
"""
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess.run(command, env=env, cwd=cwd, check=True)
|
||||
|
||||
def run_capture(self, command, env=None, cwd=None):
|
||||
""" Run one of the postgres binaries, with stderr and stdout redirected to a file.
|
||||
|
||||
This is just like `run`, but for chatty programs.
|
||||
"""
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True)
|
||||
|
||||
|
||||
@zenfixture
|
||||
def pg_bin(test_output_dir, pg_distrib_dir):
|
||||
return PgBin(test_output_dir, pg_distrib_dir)
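A short usage sketch for PgBin: run() is for quiet commands, run_capture() redirects chattier ones into the test's output directory. The pgbench invocation and the `pg` handle (from postgres.create_start) are assumed here for illustration.

```python
# Illustrative only -- not part of the suite.
pg_bin.run(['pgbench', '--version'])
pg_bin.run_capture(
    ['pgbench', '-i', '-s', '10'],
    env={'PGPORT': str(pg.port), 'PGUSER': pg.username, 'PGHOST': pg.host})
```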
|
||||
|
||||
|
||||
@zenfixture
|
||||
def base_dir():
|
||||
""" find the base directory (currently this is the git root) """
|
||||
base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..'))
|
||||
print('base_dir is', base_dir)
|
||||
return base_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def top_output_dir(base_dir):
|
||||
""" Compute the top-level directory for all tests. """
|
||||
env_test_output = os.environ.get('TEST_OUTPUT')
|
||||
if env_test_output is not None:
|
||||
output_dir = env_test_output
|
||||
else:
|
||||
output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
|
||||
mkdir_if_needed(output_dir)
|
||||
return output_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def test_output_dir(request, top_output_dir):
|
||||
""" Compute the working directory for an individual test. """
|
||||
if os.environ.get('TEST_SHARED_FIXTURES') is None:
|
||||
# one directory per test
|
||||
test_name = request.node.name
|
||||
else:
|
||||
# We're running shared fixtures. Share a single directory.
|
||||
test_name = 'shared'
|
||||
|
||||
test_output_dir = os.path.join(top_output_dir, test_name)
|
||||
print('test_output_dir is', test_output_dir)
|
||||
shutil.rmtree(test_output_dir, ignore_errors=True)
|
||||
mkdir_if_needed(test_output_dir)
|
||||
return test_output_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def repo_dir(request, test_output_dir):
|
||||
""" Compute the test repo_dir
|
||||
|
||||
"repo_dir" is the place where all of the pageserver files will go.
|
||||
It doesn't have anything to do with the git repo.
|
||||
"""
|
||||
repo_dir = os.path.join(test_output_dir, 'repo')
|
||||
return repo_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def zenith_binpath(base_dir):
|
||||
""" find the zenith binaries """
|
||||
env_zenith_bin = os.environ.get('ZENITH_BIN')
|
||||
if env_zenith_bin:
|
||||
zenith_dir = env_zenith_bin
|
||||
else:
|
||||
zenith_dir = os.path.join(base_dir, 'target/debug')
|
||||
if not os.path.exists(os.path.join(zenith_dir, 'pageserver')):
|
||||
raise Exception('zenith binaries not found at "{}"'.format(zenith_dir))
|
||||
return zenith_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def pg_distrib_dir(base_dir):
|
||||
""" find the postgress install """
|
||||
env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR')
|
||||
if env_postgres_bin:
|
||||
pg_dir = env_postgres_bin
|
||||
else:
|
||||
pg_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
|
||||
print('postgres dir is', pg_dir)
|
||||
if not os.path.exists(os.path.join(pg_dir, 'bin/postgres')):
|
||||
raise Exception('postgres not found at "{}"'.format(pg_dir))
|
||||
return pg_dir
|
||||
test_runner/pytest.ini (new file, 2 lines)
@@ -0,0 +1,2 @@
[pytest]
minversion = 6.0
test_runner/test_broken.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import pytest
import os

pytest_plugins = ("fixtures.zenith_fixtures")

"""
Use this test to see what happens when tests fail.

We should be able to clean up after ourselves, including stopping any
postgres or pageserver processes.

Set the environment variable RUN_BROKEN to see this test run (and fail,
and hopefully not leave any server processes behind).
"""

run_broken = pytest.mark.skipif(
    os.environ.get('RUN_BROKEN') is None,
    reason="only used for testing the fixtures"
)


@run_broken
def test_broken(zenith_cli, pageserver, postgres, pg_bin):
    # Create a branch for us
    zenith_cli.run(["branch", "test_broken", "empty"]);

    pg = postgres.create_start("test_broken")
    print('postgres is running')

    print('THIS NEXT COMMAND WILL FAIL:')
    pg_bin.run('pgbench -i_am_a_broken_test'.split())
test_runner/zenith_regress/.gitignore (new file, vendored, 11 lines)
@@ -0,0 +1,11 @@
# Local binaries
/pg_regress

# Generated subdirectories
/tmp_check/
/results/
/log/

# Note: regression.* are only left behind on a failure; that's why they're not ignored
#/regression.diffs
#/regression.out
test_runner/zenith_regress/README.md (new file, 11 lines)
@@ -0,0 +1,11 @@
To add a new SQL test:

- add the SQL script to zenith_regress/sql/testname.sql
- add the expected output to zenith_regress/expected/testname.out
- add testname to both the parallel_schedule and serial_schedule files*

That's it.
For more complex tests, see the PostgreSQL regression tests; they work basically the same way.

*This was changed recently in PostgreSQL upstream: there is no longer a separate serial_schedule.
Someday we'll catch up with that change.
test_runner/zenith_regress/expected/.gitignore (new file, vendored, 9 lines)
@@ -0,0 +1,9 @@
/constraints.out
/copy.out
/create_function_1.out
/create_function_2.out
/largeobject.out
/largeobject_1.out
/misc.out
/security_label.out
/tablespace.out
test_runner/zenith_regress/expected/zenith-cid.out (new file, 34 lines)
@@ -0,0 +1,34 @@
BEGIN;
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
CREATE TABLE cursor (a int);
INSERT INTO cursor VALUES (1);
DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
UPDATE cursor SET a = 2;
FETCH ALL FROM c1;
 a
---
(0 rows)

COMMIT;
DROP TABLE cursor;
create table to_be_evicted(x bigint);
begin;
insert into to_be_evicted values (1);
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
select sum(x) from to_be_evicted;
     sum
-------------
 25937424601
(1 row)

end;
drop table to_be_evicted;
test_runner/zenith_regress/expected/zenith-clog.out (new file, 15 lines)
@@ -0,0 +1,15 @@
create or replace procedure do_commits() as $$
declare
  xid xid8;
  i integer;
begin
  for i in 1..1000000 loop
    xid = txid_current();
    commit;
    if (pg_xact_status(xid) <> 'committed') then
      raise exception 'CLOG corruption';
    end if;
  end loop;
end;
$$ language plpgsql;
call do_commits();
test_runner/zenith_regress/expected/zenith-rel-truncate.out (new file, 19 lines)
@@ -0,0 +1,19 @@
--
-- Test that when a relation is truncated by VACUUM, the next smgrnblocks()
-- query to get the relation's size returns the new size.
-- (This isn't related to the TRUNCATE command, which works differently,
-- by creating a new relation file)
--
CREATE TABLE truncatetest (i int);
INSERT INTO truncatetest SELECT g FROM generate_series(1, 10000) g;
-- Remove all the rows, and run VACUUM to remove the dead tuples and
-- truncate the physical relation to 0 blocks.
DELETE FROM truncatetest;
VACUUM truncatetest;
-- Check that a SeqScan sees correct relation size (which is now 0)
SELECT * FROM truncatetest;
 i
---
(0 rows)

DROP TABLE truncatetest;
test_runner/zenith_regress/expected/zenith-vacuum-full.out (new file, 304 lines)
@@ -0,0 +1,304 @@
|
||||
create table foo(a int primary key, b int, c int);
|
||||
insert into foo values (generate_series(1,10000), generate_series(1,10000), generate_series(1,10000));
|
||||
create index concurrently on foo(b);
|
||||
create index concurrently on foo(c);
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
drop table foo;
|
||||
test_runner/zenith_regress/parallel_schedule (new file, 11 lines)
@@ -0,0 +1,11 @@
# ----------
# src/test/regress/parallel_schedule
#
# By convention, we put no more than twenty tests in any one parallel group;
# this limits the number of connections needed to run the tests.
# ----------

test: zenith-cid
test: zenith-rel-truncate
test: zenith-clog
test: zenith-vacuum-full
test_runner/zenith_regress/serial_schedule (new file, 6 lines)
@@ -0,0 +1,6 @@
# src/test/regress/serial_schedule
# This should probably be in an order similar to parallel_schedule.
test: zenith-cid
test: zenith-rel-truncate
test: zenith-clog
test: zenith-vacuum-full
Some files were not shown because too many files have changed in this diff.