Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-22 04:42:56 +00:00

Compare commits: `two_phase_...` → `parallel_w...`
159 Commits (SHA1 only):

de02cc9ee4, 96e73fb585, a533d22f71, 0ab4792943, c60e3e2337, 1e0e3fbde0, 7c96c638aa, 9838c71a47, 79d9314ba6, 2b33894e7b,
a118557331, 8ec234ba78, 70926adaba, 560f088f05, aa404b60fe, 1b6d99db7c, 605b90c6c7, 6f747893be, dab34c3dd6, bf45bef284,
d55095ab21, a048e0c7c1, 9436c4161f, e74b06d999, f6705b7a7d, 386403dcd1, 46e613f423, 56da87cadc, dcabe694ee, 842419b91f,
3cded20662, eb0a56eb22, 8a541147e2, ed0fcfa9b7, c5509b05de, befefe8d84, ad92b66eed, d119f2bcce, 038dc6c629, bfc27bee5e,
19528de03e, 3e69c41a47, 97681acfcf, baf8800b96, 577af8a459, 75e717fe86, 4987d5ee1f, 462b8801d2, 2712eaee15, 96f4ddd243,
3386ce6f35, 9c856ecf43, d4329887b3, ced338fd20, 44c35722d8, ec44f4b299, a31bba19b0, 0dd46061a8, cb2ddf06d0, eb7388e3e8,
b314311f49, 183a3022a5, 226204094a, 4f1b22a2c8, 257ade0688, 43ece6e2a2, f923464b93, 11efafb05b, 7c5532303e, 39c1d4896c,
d2d5a01522, 36d6c401bf, 37b0236e9a, cc169a6896, 77366b7a76, 9af04b3b8d, 9b7b311815, cb4f5e911c, 6403f1745c, 6d7942ece4,
09b2c66cf6, 5c70b52f4a, 19602dc88a, 2b66049b21, c3011359ab, da117f431d, 78aad4fe3f, 1c5be12515, 96c7594d29, 7a3794ef18,
bf56ea8c43, bb1446e33a, 0969574d48, 05a681be2c, 507c1fbbac, b2f51026aa, 2b0193e6bf, c31a5e2c8f, d85d67a6f1, 9b8e82c6cf,
434374ebb4, a7ae552851, 8b5a061c8e, 8147aa7e93, d18cc8a3a8, 762e9859d6, 924261f7db, 063429aade, 445e88f50b, 47694ea4f5,
3364a8d442, f2243d7459, 244fcffc50, 8c3c9c3394, 00ce635da9, 7d5f7462c1, bed75f800f, 0e423d481e, 0c74f6fa4e, fc01fae9b4,
558a2214bc, 31815bccc0, e12cab7c17, 002cd8ed5b, 588a030803, 0b9bbdc4ec, 922715a923, ac60b68d50, 1aceea1bdd, e0cc4dee4f,
fd20101e5c, 5a73a6fdfc, 4608b1ec70, ccb2eea7fd, 6b615cbde1, adc0e04205, 78e3edf2e9, 4b6563a55d, 34f4207501, d1d2d5ce69,
60ea26316e, cffc979058, 1af6607fc3, cb6e2d9ddb, 1ccf82f932, b1a424dfa9, 263acef1cc, 7c73afc1af, e8f0a9bb80, 6f9175ca2d,
69fa10ff86, d5fe515363, 6a9c036ac1, 6f9a582973, a0e23e6f3f, 84508d4f68, fb230dcf32, 4aabc9a682, 0fe81b2993
@@ -37,7 +37,7 @@ jobs:
command: |
if [ ! -e tmp_install/bin/postgres ]; then
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libcurl4-openssl-dev
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
fi

# Build postgres if the restore_cache didn't find a build.
@@ -119,8 +119,7 @@ jobs:
- target

# Run rust unit tests
# FIXME: remove -p zenith_utils once integration tests are moved to python
- run: cargo test -p zenith_utils
- run: cargo test

# Install the rust binaries, for use by test jobs
# `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
@@ -192,7 +191,12 @@ jobs:
condition: << parameters.needs_postgres_source >>
steps:
- run: git submodule update --init --depth 1
- run: pip install pytest psycopg2
- run:
name: Install pipenv & deps
working_directory: test_runner
command: |
pip install pipenv
pipenv install
- run:
name: Run pytest
working_directory: test_runner
@@ -211,25 +215,21 @@ jobs:
#
# The junit.xml file allows CircleCI to display more fine-grained test information
# in its "Tests" tab in the results page.
pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short $TEST_SELECTION $EXTRA_PARAMS
# -s prevents pytest from capturing output, which helps to see
# what's going on if the test hangs
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
- run:
# CircleCI artifacts are preserved one file at a time, so skipping
# this step isn't a good idea. If you want to extract the
# pageserver state, perhaps a tarball would be a better idea.
name: Delete pageserver data
name: Delete all data but logs
when: always
command: |
du -sh /tmp/test_output/*
for DIR in /tmp/test_output/*; do
mv $DIR/repo/pageserver.log $DIR/ || true # ignore errors
for PGDIR in $DIR/repo/pgdatadirs/pg?; do
echo "PGDIR: $PGDIR"
NEW_LOG="${PGDIR##*/}_log"
mv $PGDIR/log "$DIR/$NEW_LOG" || true # ignore errors
done
echo "rm $DIR/repo"
rm -rf $DIR/repo
done
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" -delete
du -sh /tmp/test_output/*
- store_artifacts:
path: /tmp/test_output

.dockerignore (new file, 13 lines)
@@ -0,0 +1,13 @@
**/.git/
**/__pycache__
**/.pytest_cache

/target
/tmp_check
/tmp_install
/tmp_check_cli
/test_output
/.vscode
/.zenith
/integration_tests/.zenith
/Dockerfile
.github/workflows/testing.yml (vendored, 2 lines changed)
@@ -35,7 +35,7 @@ jobs:
- name: Install postgres dependencies
run: |
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libcurl4-openssl-dev
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev

- name: Set pg revision for caching
id: pg_ver

CONTRIBUTING.md (new file, 31 lines)
@@ -0,0 +1,31 @@
# How to contribute

Howdy! Usual good software engineering practices apply. Write
tests. Write comments. Follow standard Rust coding practices where
possible. Use 'cargo fmt' and 'clippy' to tidy up formatting.

There are soft spots in the code, which could use cleanup,
refactoring, additional comments, and so forth. Let's try to raise the
bar, and clean things up as we go. Try to leave code in a better shape
than it was before.
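As a concrete illustration of that advice, here is a minimal local checklist one might run before opening a PR. This is a sketch, not part of the diff, and it assumes the standard cargo toolchain with the rustfmt and clippy components installed:

```sh
# Hypothetical pre-PR routine, run from the repository root.
cargo fmt --all      # tidy up formatting
cargo clippy         # lint for common mistakes
cargo test           # keep the CI green before pushing
```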
## Submitting changes

1. Make a PR for every change.

   Even seemingly trivial patches can break things in surprising ways.
   Use of common sense is OK. If you're only fixing a typo in a comment,
   it's probably fine to just push it. But if in doubt, open a PR.

2. Get at least one +1 on your PR before you push.

   For simple patches, it will only take a minute for someone to review it.

3. Always keep the CI green.

   Do not push if the CI failed on your PR, even if you think it's not
   your patch's fault. Help to fix the root cause if something else has
   broken the CI, before pushing.

*Happy Hacking!*
COPYRIGHT (new file, 20 lines)
@@ -0,0 +1,20 @@
This software is licensed under the Apache 2.0 License:

----------------------------------------------------------------------------
Copyright 2021 Zenith Labs, Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
----------------------------------------------------------------------------

The PostgreSQL submodule in vendor/postgres is licensed under the
PostgreSQL license. See vendor/postgres/COPYRIGHT.

Cargo.lock (generated, 513 lines changed): file diff suppressed because it is too large.
@@ -1,6 +1,5 @@
[workspace]
members = [
"integration_tests",
"pageserver",
"walkeeper",
"zenith",
@@ -8,4 +7,10 @@ members = [
"postgres_ffi",
"zenith_utils",
"workspace_hack",
"proxy"
]

[profile.release]
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
debug = true

Dockerfile (new file, 95 lines)
@@ -0,0 +1,95 @@
#
# Docker image for console integration testing.
#
# We may also reuse it in CI to unify the installation process and as a general binary-building
# tool for production servers.
#
# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls
# bindgen with the "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust
# images which are statically linked and have guards against any dlopen. I would rather
# prefer all static binaries, so we may change the way librocksdb-sys builds or wait until
# we have our own storage and drop the rocksdb dependency.
#
# Cargo-chef is used to separate dependencies building from main binaries building. This
# way `docker build` will download and install dependencies only if there are changes to
# our Cargo.toml files.
#


#
# Build postgres separately -- this layer will be rebuilt only if one of
# the mentioned paths gets any changes.
#
FROM alpine:3.13 as pg-build
RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
    make bison flex readline-dev zlib-dev perl linux-headers
WORKDIR zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
# Build using clang and lld
RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4

#
# Calculate cargo dependencies.
# This will always run, but it only generates recipe.json with the list of dependencies,
# without installing them.
#
FROM alpine:20210212 as cargo-deps-inspect
RUN apk add --update rust cargo
RUN cargo install cargo-chef
WORKDIR zenith
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

#
# Build cargo dependencies.
# This temporary container is rebuilt only if recipe.json has changed.
#
FROM alpine:20210212 as deps-build
RUN apk add --update rust cargo openssl-dev clang build-base
# rust-rocksdb can be built against system-wide rocksdb -- that saves about
# 10 minutes during build. The rocksdb apk package is in testing now, but use it
# anyway. In case of any troubles we can download and build rocksdb here manually
# (to cache it as a docker layer).
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json

#
# Build zenith binaries
#
FROM alpine:20210212 as build
RUN apk add --update rust cargo openssl-dev clang build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /root/.cargo /root/.cargo
RUN cargo build --release

#
# Copy binaries to the resulting image.
# build-base is here to provide libstdc++ (it also brings gcc, but leave it this way until we figure
# out how to statically link rocksdb or avoid it at all).
#
FROM alpine:3.13
RUN apk add --update openssl build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh

RUN addgroup zenith && adduser -h /data -D -G zenith zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["pageserver"]
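Not part of the diff, but for orientation, a minimal sketch of how this multi-stage image might be built and run locally. The image tag, container name and volume name are arbitrary examples:

```sh
# Hypothetical local build and run of the image defined above.
docker build -t zenith:local .
docker run --name pageserver -v zenith-data:/data -p 6400:6400 zenith:local
```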
LICENSE (new file, 202 lines)
@@ -0,0 +1,202 @@
Full, unmodified text of the Apache License, Version 2.0, January 2004 (http://www.apache.org/licenses/), including the appendix describing how to apply the license to a work.
Makefile (19 lines changed)
@@ -1,3 +1,11 @@
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
SECCOMP = --with-libseccomp
else
SECCOMP =
endif

#
# Top level Makefile to build Zenith and PostgreSQL
#
@@ -21,8 +29,12 @@ tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
--enable-depend --with-libxml --prefix=$(abspath tmp_install) > configure.log)
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
--enable-cassert \
--enable-debug \
--enable-depend \
$(SECCOMP) \
--prefix=$(abspath tmp_install) > configure.log)

# nicer alias for running 'configure'
postgres-configure: tmp_install/build/config.status
@@ -38,8 +50,7 @@ postgres: postgres-configure
+@echo "Compiling PostgreSQL"
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+@echo "Compiling contrib/zenith"
(cd vendor/postgres/contrib/zenith && \
$(MAKE) PG_CONFIG=$(abspath tmp_install)/bin/pg_config install USE_PGXS=1)
$(MAKE) -C tmp_install/build/contrib/zenith install

postgres-clean:
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean

Pipfile.lock (generated symbolic link, 1 line)
@@ -0,0 +1 @@
./test_runner/Pipfile.lock
README.md (84 lines changed)
@@ -4,14 +4,32 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus

## Running local installation

1. Build zenith and patched postgres
1. Install build dependencies and other useful packages

On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang
```

[Rust] 1.48 or later is also required.

To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
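For the second option, a sketch of what the environment setup might look like (not part of the original README; paths assume the default `tmp_install` directory in the repository root):

```sh
# Run from the repository root; adjust the paths if your checkout lives elsewhere.
export PATH="$PWD/tmp_install/bin:$PATH"
export LD_LIBRARY_PATH="$PWD/tmp_install/lib:$LD_LIBRARY_PATH"
psql --version    # should now resolve to the freshly built client
```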
To run the integration tests (not required to use the code), install
Python (3.6 or higher), and install python3 packages with `pip` (called `pip3` on some systems):
```
pip install pytest psycopg2
```

2. Build zenith and patched postgres
```sh
git clone --recursive https://github.com/libzenith/zenith.git
cd zenith
make -j5
```

2. Start pageserver and postgres on top of it (should be called from repo root):
3. Start pageserver and postgres on top of it (should be called from repo root):
```sh
# Create repository in .zenith with proper paths to binaries and data
# Later that would be responsibility of a package install script
@@ -35,8 +53,8 @@ BRANCH ADDRESS LSN STATUS
main 127.0.0.1:55432 0/1609610 running
```

3. Now it is possible to connect to postgres and run some queries:
```sh
4. Now it is possible to connect to postgres and run some queries:
```text
> psql -p55432 -h 127.0.0.1 postgres
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
@@ -49,7 +67,7 @@ postgres=# select * from t;
(1 row)
```

4. And create branches and run postgres on them:
5. And create branches and run postgres on them:
```sh
# create branch named migration_check
> ./target/debug/zenith branch migration_check main
@@ -83,33 +101,67 @@ INSERT 0 1
```sh
git clone --recursive https://github.com/libzenith/zenith.git
make # builds also postgres and installs it to ./tmp_install
cd test_runner
pytest
```

## Documentation

Now we use README files to cover design ideas and overall architecture for each module.
And rustdoc style documentation comments.

To view your documentation in a browser, try running `cargo doc --no-deps --open`

## Source tree layout

/walkeeper:
`/control_plane`:

WAL safekeeper. Written in Rust.
Local control plane.
Functions to start, configure and stop pageserver and postgres instances running as a local processes.
Intended to be used in integration tests and in CLI tools for local installations.

/pageserver:
`/zenith`

Main entry point for the 'zenith' CLI utility.
TODO: Doesn't it belong to control_plane?

`/postgres_ffi`:

Utility functions for interacting with PostgreSQL file formats.
Misc constants, copied from PostgreSQL headers.

`/zenith_utils`:

Helpers that are shared between other crates in this repository.

`/walkeeper`:

WAL safekeeper (also known as WAL acceptor). Written in Rust.

`/pageserver`:

Page Server. Written in Rust.

Depends on the modified 'postgres' binary for WAL redo.

/integration_tests:

Tests with different combinations of a Postgres compute node, WAL safekeeper and Page Server.

/vendor/postgres:
`/vendor/postgres`:

PostgreSQL source tree, with the modifications needed for Zenith.

/vendor/postgres/src/bin/safekeeper:
`/vendor/postgres/contrib/zenith`:

Extension (safekeeper_proxy) that runs in the compute node, and connects to the WAL safekeepers
and streams the WAL
PostgreSQL extension that implements storage manager API and network communications with remote page server.

`/test_runner`:

Integration tests, written in Python using the `pytest` framework.

`test_runner/zenith_regress`:

Quick way to add new SQL regression test to integration tests set.

`/integration_tests`:

Another pack of integration tests. Written in Rust.

[Rust]: https://www.rust-lang.org/learn/get-started

@@ -16,11 +16,8 @@ toml = "0.5"
lazy_static = "1.4"
regex = "1"
anyhow = "1.0"
# hex = "0.4.3"
bytes = "1.0.1"
# fs_extra = "1.2.0"
nix = "0.20"
# thiserror = "1"
url = "2.2.2"

pageserver = { path = "../pageserver" }

@@ -14,6 +14,7 @@ use std::{
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use zenith_utils::connstring::connection_host_port;

use crate::local_env::LocalEnv;
use pageserver::ZTimelineId;
@@ -97,51 +98,6 @@ impl ComputeControlPlane {
Ok(node)
}

pub fn new_test_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
let timeline_id = self
.pageserver
.branch_get_by_name(branch_name)
.expect("failed to get timeline_id")
.timeline_id;

let node = self.new_from_page_server(true, timeline_id, branch_name);
let node = node.unwrap();

// Configure the node to stream WAL directly to the pageserver
node.append_conf(
"postgresql.conf",
format!(
"shared_preload_libraries = zenith\n\
zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
node.connstr()
)
.as_str(),
)
.unwrap();

node
}

pub fn new_test_master_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
let timeline_id = self
.pageserver
.branch_get_by_name(branch_name)
.expect("failed to get timeline_id")
.timeline_id;

let node = self
.new_from_page_server(true, timeline_id, branch_name)
.unwrap();

node.append_conf(
"postgresql.conf",
"synchronous_standby_names = 'safekeeper_proxy'\n",
)
.unwrap();

node
}

pub fn new_node(&mut self, branch_name: &str) -> Result<Arc<PostgresNode>> {
let timeline_id = self.pageserver.branch_get_by_name(branch_name)?.timeline_id;

@@ -151,8 +107,11 @@ impl ComputeControlPlane {
node.append_conf(
"postgresql.conf",
format!(
"shared_preload_libraries = zenith\n\
zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
concat!(
"shared_preload_libraries = zenith\n",
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
),
node.connstr()
)
.as_str(),
@@ -252,6 +211,7 @@ impl PostgresNode {
// new data directory
pub fn init_from_page_server(&self) -> Result<()> {
let pgdata = self.pgdata();

println!(
"Extracting base backup to create postgres instance: path={} port={}",
pgdata.display(),
@@ -300,11 +260,13 @@ impl PostgresNode {
ar.unpack(&pgdata)
.with_context(|| "extracting page backup failed")?;

// listen for selected port
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
self.append_conf(
"postgresql.conf",
&format!(
"max_wal_senders = 10\n\
wal_log_hints = on\n\
max_replication_slots = 10\n\
hot_standby = on\n\
shared_buffers = 1MB\n\
@@ -328,21 +290,19 @@ impl PostgresNode {
// Connect it to the page server.

// Configure that node to take pages from pageserver
let (host, port) = connection_host_port(&self.pageserver.connection_config());
self.append_conf(
"postgresql.conf",
&format!(
"shared_preload_libraries = zenith \n\
zenith.page_server_connstring = 'host={} port={}'\n\
zenith.zenith_timeline='{}'\n",
self.pageserver.address().ip(),
self.pageserver.address().port(),
self.timelineid
host, port, self.timelineid
),
)?;

fs::create_dir_all(self.pgdata().join("pg_wal"))?;
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
self.pg_resetwal(&["-f"])?;
Ok(())
}

@@ -381,7 +341,8 @@ impl PostgresNode {
"-D",
self.pgdata().to_str().unwrap(),
"-l",
self.pgdata().join("log").to_str().unwrap(),
self.pgdata().join("pg.log").to_str().unwrap(),
"-w", //wait till pg_ctl actually does what was asked
],
args,
]
@@ -398,19 +359,6 @@ impl PostgresNode {
Ok(())
}

fn pg_resetwal(&self, args: &[&str]) -> Result<()> {
let pg_resetwal_path = self.env.pg_bin_dir().join("pg_resetwal");

let pg_ctl = Command::new(pg_resetwal_path)
.args([&["-D", self.pgdata().to_str().unwrap()], args].concat())
.status()
.with_context(|| "pg_resetwal failed")?;
if !pg_ctl.success() {
anyhow::bail!("pg_resetwal failed");
}
Ok(())
}

pub fn start(&self) -> Result<()> {
println!("Starting postgres node at '{}'", self.connstr());
self.pg_ctl(&["start"])

@@ -1,7 +1,7 @@
//
// Local control plane.
//
// Can start, cofigure and stop postgres instances running as a local processes.
// Can start, configure and stop postgres instances running as a local processes.
//
// Intended to be used in integration tests and in CLI tools for
// local installations.

@@ -49,7 +49,7 @@ impl LocalEnv {
Ok(self
.zenith_distrib_dir
.as_ref()
.ok_or(anyhow!("Can not manage remote pageserver"))?
.ok_or_else(|| anyhow!("Can not manage remote pageserver"))?
.join("pageserver"))
}

@@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::net::{SocketAddr, TcpStream};
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::thread;
@@ -8,11 +8,12 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Result};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::{Client, NoTls};
use postgres::{Config, NoTls};

use crate::local_env::LocalEnv;
use crate::read_pidfile;
use pageserver::branches::BranchInfo;
use zenith_utils::connstring::connection_address;

//
// Control routines for pageserver.
@@ -21,7 +22,7 @@ use pageserver::branches::BranchInfo;
//
pub struct PageServerNode {
pub kill_on_exit: bool,
pub listen_address: Option<SocketAddr>,
pub connection_config: Option<Config>,
pub env: LocalEnv,
}

@@ -29,32 +30,36 @@ impl PageServerNode {
pub fn from_env(env: &LocalEnv) -> PageServerNode {
PageServerNode {
kill_on_exit: false,
listen_address: None, // default
connection_config: None, // default
env: env.clone(),
}
}

pub fn address(&self) -> SocketAddr {
match self.listen_address {
Some(addr) => addr,
None => "127.0.0.1:64000".parse().unwrap(),
fn default_config() -> Config {
"postgresql://no_user@localhost:64000/no_db"
.parse()
.unwrap()
}

pub fn connection_config(&self) -> Config {
match &self.connection_config {
Some(config) => config.clone(),
None => Self::default_config(),
}
}

pub fn init(&self) -> Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let status = cmd
.args(&["--init", "-D", self.env.base_data_dir.to_str().unwrap()])
.args(&[
"--init",
"-D",
self.env.base_data_dir.to_str().unwrap(),
"--postgres-distrib",
self.env.pg_distrib_dir.to_str().unwrap(),
])
.env_clear()
.env("RUST_BACKTRACE", "1")
.env(
"POSTGRES_DISTRIB_DIR",
self.env.pg_distrib_dir.to_str().unwrap(),
)
.env("ZENITH_REPO_DIR", self.repo_path())
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.status()
.expect("pageserver init failed");

@@ -76,28 +81,15 @@ impl PageServerNode {
pub fn start(&self) -> Result<()> {
println!(
"Starting pageserver at '{}' in {}",
self.address(),
connection_address(&self.connection_config()),
self.repo_path().display()
);

let mut cmd = Command::new(self.env.pageserver_bin()?);
cmd.args(&[
"-l",
self.address().to_string().as_str(),
"-D",
self.repo_path().to_str().unwrap(),
])
.arg("-d")
.env_clear()
.env("RUST_BACKTRACE", "1")
.env(
"POSTGRES_DISTRIB_DIR",
self.env.pg_distrib_dir.to_str().unwrap(),
)
.env("ZENITH_REPO_DIR", self.repo_path())
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
.arg("-d")
.env_clear()
.env("RUST_BACKTRACE", "1");

if !cmd.status()?.success() {
bail!(
@@ -131,42 +123,29 @@ impl PageServerNode {
}

// wait for pageserver stop
let address = connection_address(&self.connection_config());
for _ in 0..5 {
let stream = TcpStream::connect(self.address());
let stream = TcpStream::connect(&address);
thread::sleep(Duration::from_secs(1));
if let Err(_e) = stream {
println!("Pageserver stopped");
return Ok(());
}
println!("Stopping pageserver on {}", self.address());
println!("Stopping pageserver on {}", address);
}

bail!("Failed to stop pageserver with pid {}", pid);
}

pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
self.address().port(),
"no_db",
"no_user",
);
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
let mut client = self.connection_config().connect(NoTls).unwrap();

println!("Pageserver query: '{}'", sql);
client.simple_query(sql).unwrap()
}

pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
self.address().port(),
"no_db",
"no_user",
);
Client::connect(connstring.as_str(), NoTls)
self.connection_config().connect(NoTls)
}

pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {
@@ -225,22 +204,6 @@ impl PageServerNode {

Ok(branch.clone())
}

pub fn system_id_get(&self) -> Result<u64> {
let mut client = self.page_server_psql_client()?;
let query_result = client
.simple_query("identify_system")?
.first()
.map(|msg| match msg {
postgres::SimpleQueryMessage::Row(row) => row.get(0),
_ => None,
})
.flatten()
.ok_or_else(|| anyhow!("failed to get system_id"))?
.parse::<u64>()?;

Ok(query_result)
}
}

impl Drop for PageServerNode {

docker-entrypoint.sh (new executable file, 11 lines)
@@ -0,0 +1,11 @@
#!/bin/sh
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/timelines" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data --postgres-distrib /usr/local
fi
echo "Starting pageserver at 0.0.0.0:6400"
pageserver -l 0.0.0.0:6400 -D /data
else
"$@"
fi
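A hedged usage sketch of the entrypoint's dispatch logic, using the hypothetical image tag from the build example above: the default CMD starts the pageserver (initializing /data on first start), and any other argument vector is run unchanged.

```sh
docker run zenith:local                # default CMD: pageserver on 0.0.0.0:6400
docker run -it zenith:local /bin/sh    # any other command bypasses the pageserver branch and is run as-is
```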
integration_tests/.gitignore (vendored, file deleted)
@@ -1 +0,0 @@
tmp_check/
@@ -1,18 +0,0 @@
[package]
name = "integration_tests"
version = "0.1.0"
authors = ["Stas Kelvich <stas@zenith.tech>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
lazy_static = "1.4.0"
rand = "0.8.3"
anyhow = "1.0"
nix = "0.20"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }

pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
control_plane = { path = "../control_plane" }
@@ -1,416 +0,0 @@
Entire file removed (416 lines): the Rust integration-test harness. It contained the cargo_bin_dir() and create_test_env() helpers, the TestStorageControlPlane struct (one_page_server and fault_tolerant storage setups, wal-acceptor connection info, stop/is_running), the PostgresNodeExt testing trait (pg_regress, pg_bench, start_proxy, open_psql, dump_log_file, safe_psql) implemented for control_plane::compute::PostgresNode, and the WalAcceptorNode and WalProposerNode process-control types.
@@ -1,332 +0,0 @@
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use std::{thread, time};
|
||||
|
||||
use control_plane::compute::{ComputeControlPlane, PostgresNode};
|
||||
|
||||
use integration_tests;
|
||||
use integration_tests::PostgresNodeExt;
|
||||
use integration_tests::TestStorageControlPlane;
|
||||
|
||||
const DOWNTIME: u64 = 2;
|
||||
|
||||
fn start_node_with_wal_proposer(
|
||||
timeline: &str,
|
||||
compute_cplane: &mut ComputeControlPlane,
|
||||
wal_acceptors: &String,
|
||||
) -> Arc<PostgresNode> {
|
||||
let node = compute_cplane.new_test_master_node(timeline);
|
||||
let _node = node.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!("wal_acceptors='{}'\n", wal_acceptors),
|
||||
);
|
||||
node.start().unwrap();
|
||||
node
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_embedded_wal_proposer() {
|
||||
let local_env = integration_tests::create_test_env("test_embedded_wal_proposer");
|
||||
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
// check wal files equality
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_acceptors_normal_work() {
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_normal_work");
|
||||
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
// check wal files equality
|
||||
}
|
||||
|
||||
// Run page server and multiple safekeepers, and multiple compute nodes running
|
||||
// against different timelines.
|
||||
#[test]
|
||||
fn test_many_timelines() {
|
||||
// Initialize a new repository, and set up WAL safekeepers and page server.
|
||||
const REDUNDANCY: usize = 3;
|
||||
const N_TIMELINES: usize = 5;
|
||||
let local_env = integration_tests::create_test_env("test_many_timelines");
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// Create branches
|
||||
let mut timelines: Vec<String> = Vec::new();
|
||||
timelines.push("main".to_string());
|
||||
|
||||
for i in 1..N_TIMELINES {
|
||||
let branchname = format!("experimental{}", i);
|
||||
storage_cplane
|
||||
.pageserver
|
||||
.branch_create(&branchname, "main")
|
||||
.unwrap();
|
||||
timelines.push(branchname);
|
||||
}
|
||||
|
||||
// start postgres on each timeline
|
||||
let mut nodes = Vec::new();
|
||||
for tli_name in timelines {
|
||||
let node = start_node_with_wal_proposer(&tli_name, &mut compute_cplane, &wal_acceptors);
|
||||
nodes.push(node.clone());
|
||||
}
|
||||
|
||||
// create schema
|
||||
for node in &nodes {
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
}
|
||||
|
||||
// Populate data
|
||||
for node in &nodes {
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
}
|
||||
|
||||
// Check data
|
||||
for node in &nodes {
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
}
|
||||
}
|
||||
|
||||
// Majority is always alive
|
||||
#[test]
|
||||
fn test_acceptors_restarts() {
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_restarts");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
const FAULT_PROBABILITY: f32 = 0.01;
|
||||
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
let mut failed_node: Option<usize> = None;
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
let mut psql = node.open_psql("postgres");
|
||||
for i in 1..=1000 {
|
||||
psql.execute("INSERT INTO t values ($1, 'payload')", &[&i])
|
||||
.unwrap();
|
||||
let prob: f32 = rng.gen();
|
||||
if prob <= FAULT_PROBABILITY {
|
||||
if let Some(node) = failed_node {
|
||||
storage_cplane.wal_acceptors[node].start();
|
||||
failed_node = None;
|
||||
} else {
|
||||
let node: usize = rng.gen_range(0..REDUNDANCY);
|
||||
failed_node = Some(node);
|
||||
storage_cplane.wal_acceptors[node].stop().unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 500500);
|
||||
}
|
||||
|
||||
fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
thread::sleep(time::Duration::from_secs(DOWNTIME));
|
||||
cp.wal_acceptors[no].start();
|
||||
});
|
||||
}
|
||||
|
||||
// Stop majority of acceptors while compute is under the load. Boot
|
||||
// them again and check that nothing was lost. Repeat.
|
||||
// N_CRASHES env var
|
||||
#[test]
|
||||
fn test_acceptors_unavailability() {
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_unavailability");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 2;
|
||||
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
let mut psql = node.open_psql("postgres");
|
||||
psql.execute("INSERT INTO t values (1, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
// Shut down all wal acceptors
|
||||
storage_cplane.wal_acceptors[0].stop().unwrap();
|
||||
let cp = Arc::new(storage_cplane);
|
||||
start_acceptor(&cp, 0);
|
||||
let now = SystemTime::now();
|
||||
psql.execute("INSERT INTO t values (2, 'payload')", &[])
|
||||
.unwrap();
|
||||
// Here we check that the query above was hanging
|
||||
// while wal_acceptor was unavailable
|
||||
assert!(now.elapsed().unwrap().as_secs() >= DOWNTIME);
|
||||
psql.execute("INSERT INTO t values (3, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
cp.wal_acceptors[1].stop().unwrap();
|
||||
start_acceptor(&cp, 1);
|
||||
psql.execute("INSERT INTO t values (4, 'payload')", &[])
|
||||
.unwrap();
|
||||
// Here we check that the query above was hanging
|
||||
// while wal_acceptor was unavailable
|
||||
assert!(now.elapsed().unwrap().as_secs() >= 2 * DOWNTIME);
|
||||
|
||||
psql.execute("INSERT INTO t values (5, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
// Ensure that all inserts succeeded.
|
||||
// Including ones that were waiting for wal acceptor restart.
|
||||
assert_eq!(count, 15);
|
||||
}
|
||||
|
||||
fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
|
||||
let mut rng = rand::thread_rng();
|
||||
let n_acceptors = cplane.wal_acceptors.len();
|
||||
let failure_period = time::Duration::from_secs(1);
|
||||
while cplane.is_running() {
|
||||
thread::sleep(failure_period);
|
||||
let mask: u32 = rng.gen_range(0..(1 << n_acceptors));
|
||||
for i in 0..n_acceptors {
|
||||
if (mask & (1 << i)) != 0 {
|
||||
cplane.wal_acceptors[i].stop().unwrap();
|
||||
}
|
||||
}
|
||||
thread::sleep(failure_period);
|
||||
for i in 0..n_acceptors {
|
||||
if (mask & (1 << i)) != 0 {
|
||||
cplane.wal_acceptors[i].start();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Race condition test
|
||||
#[test]
|
||||
fn test_race_conditions() {
|
||||
let local_env = integration_tests::create_test_env("test_race_conditions");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
|
||||
let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(
|
||||
&local_env, REDUNDANCY,
|
||||
));
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
|
||||
let cp = storage_cplane.clone();
|
||||
let failures_thread = thread::spawn(move || {
|
||||
simulate_failures(cp);
|
||||
});
|
||||
|
||||
let mut psql = node.open_psql("postgres");
|
||||
for i in 1..=1000 {
|
||||
psql.execute("INSERT INTO t values ($1, 'payload')", &[&i])
|
||||
.unwrap();
|
||||
}
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 500500);
|
||||
|
||||
storage_cplane.stop();
|
||||
failures_thread.join().unwrap();
|
||||
}
|
||||
@@ -10,7 +10,7 @@ edition = "2018"
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
bytes = { version = "1.0.1", features = ['serde'] }
|
||||
byteorder = "1.4.3"
|
||||
futures = "0.3.13"
|
||||
lazy_static = "1.4.0"
|
||||
@@ -30,17 +30,19 @@ tokio-stream = { version = "0.1.4" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
rocksdb = "0.16.0"
|
||||
# by default rust-rocksdb tries to build a lot of compression algos. Use lz4 only for now as it is the simplest dependency.
|
||||
rocksdb = { version = "0.16.0", features = ["lz4"], default-features = false }
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
walkdir = "2"
|
||||
thiserror = "1.0"
|
||||
hex = "0.4.3"
|
||||
tar = "0.4.33"
|
||||
parse_duration = "2.1.1"
|
||||
humantime = "2.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
fs_extra = "1.2.0"
|
||||
toml = "0.5"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
@@ -1,82 +1,4 @@
|
||||
Page Server
===========


How to test
-----------


1. Compile and install Postgres from this repository (there are
modifications, so vanilla Postgres won't do)

    ./configure --prefix=/home/heikki/zenith-install

2. Compile the page server

    cd pageserver
    cargo build

3. Create another "dummy" cluster that will be used by the page server when it applies
the WAL records. (shouldn't really need this, getting rid of it is a TODO):

    /home/heikki/zenith-install/bin/initdb -D /data/zenith-dummy


4. Initialize and start a new postgres cluster

    /home/heikki/zenith-install/bin/initdb -D /data/zenith-test-db --username=postgres
    /home/heikki/zenith-install/bin/postgres -D /data/zenith-test-db

5. In another terminal, start the page server.

    PGDATA=/data/zenith-dummy PATH=/home/heikki/zenith-install/bin:$PATH ./target/debug/pageserver

It should connect to the postgres instance using streaming replication, and print something
like this:

    $ PGDATA=/data/zenith-dummy PATH=/home/heikki/zenith-install/bin:$PATH ./target/debug/pageserver
    Starting WAL receiver
    connecting...
    Starting page server on 127.0.0.1:5430
    connected!
    page cache is empty

6. You can now open another terminal and issue DDL commands. Generated WAL records will
be streamed to the page server, and attached to the blocks that they apply to in its
page cache

    $ psql postgres -U postgres
    psql (14devel)
    Type "help" for help.

    postgres=# create table mydata (i int4);
    CREATE TABLE
    postgres=# insert into mydata select g from generate_series(1,100) g;
    INSERT 0 100
    postgres=#

7. The GetPage@LSN interface to the compute nodes isn't working yet, but to simulate
that, the page server generates a test GetPage@LSN call every 5 seconds on a random
block that's in the page cache. In a few seconds, you should see output from that:

    testing GetPage@LSN for block 0
    WAL record at LSN 23584576 initializes the page
    2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167DF40
    2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167DF80
    2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167DFC0
    2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167E018
    2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167E058
    2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167E098
    2021-03-19 11:03:13.791 EET [11439] LOG: applied WAL record at 0/167E0D8
    2021-03-19 11:03:13.792 EET [11439] LOG: applied WAL record at 0/167E118
    2021-03-19 11:03:13.792 EET [11439] LOG: applied WAL record at 0/167E158
    2021-03-19 11:03:13.792 EET [11439] LOG: applied WAL record at 0/167E198
    applied 10 WAL records to produce page image at LSN 18446744073709547246


Architecture
============
## Page server architecture

The Page Server is responsible for all operations on a number of
"chunks" of relation data. A chunk corresponds to a PostgreSQL
@@ -84,8 +6,10 @@ relation segment (i.e. one max. 1 GB file in the data directory), but
it holds all the different versions of every page in the segment that
are still needed by the system.

Determining which chunk each Page Server holds is handled elsewhere. (TODO:
currently, there is only one Page Server which holds all chunks)
Currently we do not specifically organize data in chunks.
All page images and corresponding WAL records are stored as entries in a key-value store,
where the StorageKey is a zenith_timeline_id + BufferTag + LSN.

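As a rough illustration only (the actual type definitions in the source may differ), such a composite key can be pictured like this; the ZTimelineId representation is an assumption, while RelTag/BufferTag mirror the shapes used elsewhere in this diff:

    // Hedged sketch of the composite storage key described above.
    type ZTimelineId = [u8; 16]; // assumed representation, for illustration
    type Lsn = u64;              // WAL position

    struct RelTag {
        spcnode: u32,
        dbnode: u32,
        relnode: u32,
        forknum: u8,
    }

    struct BufferTag {
        rel: RelTag,
        blknum: u32,
    }

    struct StorageKey {
        timeline_id: ZTimelineId,
        tag: BufferTag,
        lsn: Lsn,
    }
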
The Page Server has a few different duties:

@@ -154,11 +78,33 @@ and stores them to the page cache.
Page Cache
----------

The Page Cache is a data structure that holds all the different page versions.
It is accessed by all the other threads to perform their duties.
The Page Cache is a switchboard to access different Repositories.

Currently, the page cache is implemented fully in-memory. TODO: Store it
on disk. Define a file format.
#### Repository
A Repository corresponds to one .zenith directory.
A Repository is needed to manage Timelines.

#### Timeline
A Timeline is the page cache workhorse: it accepts page changes
and serves get_page_at_lsn() and get_rel_size() requests.
Note: this has nothing to do with the PostgreSQL WAL timeline.

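A minimal sketch of that interface, assuming simplified types and signatures (only the two calls named above; the real trait has more methods and different error handling):

    type Lsn = u64;
    type PageImage = Vec<u8>;
    struct BufferTag; // placeholder
    struct RelTag;    // placeholder

    trait Timeline {
        /// Reconstruct the page image as it existed at the given LSN.
        fn get_page_at_lsn(&self, tag: BufferTag, lsn: Lsn) -> anyhow::Result<PageImage>;
        /// Report the relation size (in blocks) at the given LSN.
        fn get_rel_size(&self, rel: RelTag, lsn: Lsn) -> anyhow::Result<u32>;
    }
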
#### Branch
We can create a branch at a certain LSN.
Each Branch lives in a corresponding timeline and has an ancestor.

To get a full snapshot of the data at a certain moment, we need to traverse the timeline and its ancestors, as sketched below.

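All names in this sketch are assumptions made for the example, not the repository's actual API:

    // Walk a timeline and its ancestors until one of them can serve the page.
    type Lsn = u64;

    struct TimelineNode {
        ancestor: Option<Box<TimelineNode>>, // parent branch, if any
    }

    impl TimelineNode {
        // Placeholder: consult this timeline's own storage here.
        fn lookup_page(&self, _lsn: Lsn) -> Option<Vec<u8>> {
            None
        }

        fn get_page_traversing_ancestors(&self, lsn: Lsn) -> Option<Vec<u8>> {
            let mut current = Some(self);
            while let Some(tli) = current {
                if let Some(img) = tli.lookup_page(lsn) {
                    return Some(img);
                }
                current = tli.ancestor.as_deref();
            }
            None
        }
    }
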
#### ObjectRepository
ObjectRepository implements Repository and has an associated ObjectStore and WAL redo service.

#### ObjectStore
ObjectStore is an interface to a key-value store for page images and WAL records.
Currently it has one implementation: RocksDB.

#### WAL redo service
The WAL redo service runs PostgreSQL in a special wal_redo mode
to apply the given WAL records over an old page image and return the new page image.

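As a hedged sketch of that contract (the trait name and signature here are assumptions; the code in this diff only shows a DummyRedoManager being plugged in):

    // One request hands the redo service an optional old page image plus the
    // WAL records that touch that page, and gets the new page image back.
    trait WalRedoManager {
        fn request_redo(
            &self,
            old_page: Option<Vec<u8>>, // None if the first record initializes the page
            records: Vec<Vec<u8>>,     // raw WAL records affecting this page
        ) -> anyhow::Result<Vec<u8>>;
    }
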
TODO: Garbage Collection / Compaction
@@ -177,3 +123,7 @@ The backup service is responsible for periodically pushing the chunks to S3.
TODO: How/when do we restore from S3? Whenever we get a GetPage@LSN request for
a chunk we don't currently have? Or when an external Control Plane tells us?

TODO: Sharding
--------------------

We should be able to run multiple Page Servers that handle sharded data.

@@ -1,62 +0,0 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Set up a simple Compute Node + Page Server combination locally.
|
||||
#
|
||||
# NOTE: This doesn't clean up between invocations. You'll need to manually:
|
||||
#
|
||||
# - Kill any previous 'postgres' and 'pageserver' processes
|
||||
# - Clear the S3 bucket
|
||||
# - Remove the 'zenith-pgdata' directory
|
||||
|
||||
|
||||
set -e
|
||||
|
||||
# Set up some config.
|
||||
#
|
||||
# CHANGE THESE ACCORDING TO YOUR S3 INSTALLATION
|
||||
export S3_REGION=auto
|
||||
export S3_ENDPOINT=https://localhost:9000
|
||||
export S3_ACCESSKEY=minioadmin
|
||||
export S3_SECRET=pikkunen
|
||||
export S3_BUCKET=zenith-testbucket
|
||||
|
||||
|
||||
COMPUTE_NODE_PGDATA=zenith-pgdata
|
||||
|
||||
|
||||
# 1. Initialize a cluster.
|
||||
initdb -D $COMPUTE_NODE_PGDATA -U zenith
|
||||
|
||||
echo "port=65432" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
echo "log_connections=on" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
|
||||
# Use a small shared_buffers, so that we hit the Page Server more
|
||||
# easily.
|
||||
echo "shared_buffers = 1MB" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
|
||||
# TODO: page server should use a replication slot, or some other mechanism
|
||||
# to make sure that the primary doesn't lose data that the page server still
|
||||
# needs. (The WAL safekeepers should ensure that)
|
||||
echo "wal_keep_size=10GB" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
|
||||
# Tell the Postgres server how to connect to the Page Server
|
||||
echo "page_server_connstring='host=localhost port=5430'" >> $COMPUTE_NODE_PGDATA/postgresql.conf
|
||||
|
||||
|
||||
# 2. Run zenith_push to push a base backup of the database to an S3 bucket. The
|
||||
# Page Server will read it from there
|
||||
zenith_push -D $COMPUTE_NODE_PGDATA
|
||||
|
||||
|
||||
# 3. Launch page server
|
||||
rm -rf /tmp/pgdata-dummy
|
||||
initdb -N -D /tmp/pgdata-dummy
|
||||
PGDATA=/tmp/pgdata-dummy ./target/debug/pageserver &
|
||||
|
||||
# 4. Start up the Postgres server
|
||||
postgres -D $COMPUTE_NODE_PGDATA &
|
||||
|
||||
|
||||
echo "ALL SET! You can now connect to Postgres with something like:"
|
||||
echo ""
|
||||
echo 'psql "dbname=postgres host=localhost user=zenith port=65432"'
|
||||
@@ -1,350 +1,275 @@
|
||||
//!
|
||||
//! Generate a tarball with files needed to bootstrap ComputeNode.
|
||||
//!
|
||||
//! TODO: this module has nothing to do with PostgreSQL pg_basebackup.
|
||||
//! It could use a better name.
|
||||
//!
|
||||
//! A stateless Postgres compute node is launched by sending it a tarball that contains non-relational data (multixacts, clog, filenode maps, twophase files)
//! together with a generated pg_control and a dummy WAL segment. This module is responsible for creating such a tarball from the snapshot directory and the
//! data stored in the object storage.
//!
|
||||
use crate::ZTimelineId;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use log::*;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use tar::{Builder, Header};
|
||||
use walkdir::WalkDir;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
|
||||
use crate::repository::{BufferTag, RelTag, Timeline};
|
||||
use crate::object_key::*;
|
||||
use crate::repository::Timeline;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
fn new_tar_header(path: &str, size: u64) -> anyhow::Result<Header> {
|
||||
let mut header = Header::new_gnu();
|
||||
header.set_size(size);
|
||||
header.set_path(path)?;
|
||||
header.set_mode(0b110000000);
|
||||
header.set_mtime(
|
||||
SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
);
|
||||
header.set_cksum();
|
||||
Ok(header)
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
/// used for constructing tarball.
|
||||
pub struct Basebackup<'a> {
|
||||
ar: Builder<&'a mut dyn Write>,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
snappath: String,
|
||||
slru_buf: [u8; pg_constants::SLRU_SEG_SIZE],
|
||||
slru_segno: u32,
|
||||
slru_path: &'static str,
|
||||
}
|
||||
|
||||
//
|
||||
// Generate SLRU segment files from repository
|
||||
//
|
||||
fn add_slru_segments(
|
||||
ar: &mut Builder<&mut dyn Write>,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
path: &str,
|
||||
forknum: u8,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let rel = RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum,
|
||||
};
|
||||
let (first, last) = timeline.get_range(rel, lsn)?;
|
||||
const SEG_SIZE: usize =
|
||||
pg_constants::BLCKSZ as usize * pg_constants::SLRU_PAGES_PER_SEGMENT as usize;
|
||||
let mut seg_buf = [0u8; SEG_SIZE];
|
||||
let mut curr_segno: Option<u32> = None;
|
||||
for page in first..last {
|
||||
let tag = BufferTag { rel, blknum: page };
|
||||
let img = timeline.get_page_at_lsn(tag, lsn)?;
|
||||
// Zero length image indicates truncated segment: just skip it
|
||||
if img.len() != 0 {
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
impl<'a> Basebackup<'a> {
|
||||
pub fn new(
|
||||
write: &'a mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
snapshot_lsn: Lsn,
|
||||
) -> Basebackup<'a> {
|
||||
Basebackup {
|
||||
ar: Builder::new(write),
|
||||
timeline,
|
||||
lsn,
|
||||
prev_record_lsn,
|
||||
snappath: format!("timelines/{}/snapshots/{:016X}", timelineid, snapshot_lsn.0),
|
||||
slru_path: "",
|
||||
slru_segno: u32::MAX,
|
||||
slru_buf: [0u8; pg_constants::SLRU_SEG_SIZE],
|
||||
}
|
||||
}
|
||||
|
||||
let segno = page / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if curr_segno.is_some() && curr_segno.unwrap() != segno {
|
||||
let segname = format!("{}/{:>04X}", path, curr_segno.unwrap());
|
||||
let header = new_tar_header(&segname, SEG_SIZE as u64)?;
|
||||
ar.append(&header, &seg_buf[..])?;
|
||||
seg_buf = [0u8; SEG_SIZE];
|
||||
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
|
||||
debug!("sending tarball of snapshot in {}", self.snappath);
|
||||
for entry in WalkDir::new(&self.snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&self.snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
curr_segno = Some(segno);
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
self.ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
if entry.file_name() != "pg_filenode.map" // this files will be generated from object storage
|
||||
&& !relpath.starts_with("pg_xact/")
|
||||
&& !relpath.starts_with("pg_multixact/")
|
||||
{
|
||||
trace!("sending {}", relpath.display());
|
||||
self.ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
// relation pages are loaded on demand and should not be included in tarball
|
||||
trace!("not sending {}", relpath.display());
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
}
|
||||
}
|
||||
|
||||
// Generate non-relational files.
|
||||
// Iteration is in sorted order: all objects of the same type are grouped and traversed
// in ascending key order. For example, all pg_xact records precede pg_multixact records and are sorted by block number.
// This makes it easy to construct SLRU segments (32 blocks).
for obj in self.timeline.list_nonrels(self.lsn)? {
|
||||
match obj {
|
||||
ObjectTag::Clog(slru) => self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
|
||||
ObjectTag::MultiXactMembers(slru) => {
|
||||
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?
|
||||
}
|
||||
ObjectTag::MultiXactOffsets(slru) => {
|
||||
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?
|
||||
}
|
||||
ObjectTag::FileNodeMap(db) => self.add_relmap_file(&obj, &db)?,
|
||||
ObjectTag::TwoPhase(prepare) => self.add_twophase_file(&obj, prepare.xid)?,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
self.finish_slru_segment()?; // write last non-completed SLRU segment (if any)
|
||||
self.add_pgcontrol_file()?;
|
||||
self.ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Generate SLRU segment files from repository. Path identifies the SLRU kind (pg_xact, pg_multixact/members, ...).
// Initially pass an empty string.
//
|
||||
fn add_slru_segment(
|
||||
&mut self,
|
||||
path: &'static str,
|
||||
tag: &ObjectTag,
|
||||
page: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(*tag, self.lsn, false)?;
|
||||
// Zero length image indicates truncated segment: just skip it
|
||||
if !img.is_empty() {
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
let segno = page / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if self.slru_path != "" && (self.slru_segno != segno || self.slru_path != path) {
|
||||
// Switch to new segment: save old one
|
||||
let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
|
||||
let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
|
||||
self.ar.append(&header, &self.slru_buf[..])?;
|
||||
self.slru_buf = [0u8; pg_constants::SLRU_SEG_SIZE]; // reinitialize segment buffer
|
||||
}
|
||||
self.slru_segno = segno;
|
||||
self.slru_path = path;
|
||||
let offs_start = (page % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
|
||||
* pg_constants::BLCKSZ as usize;
|
||||
let offs_end = offs_start + pg_constants::BLCKSZ as usize;
|
||||
seg_buf[offs_start..offs_end].copy_from_slice(&img);
|
||||
self.slru_buf[offs_start..offs_end].copy_from_slice(&img);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
if curr_segno.is_some() {
|
||||
let segname = format!("{}/{:>04X}", path, curr_segno.unwrap());
|
||||
let header = new_tar_header(&segname, SEG_SIZE as u64)?;
|
||||
ar.append(&header, &seg_buf[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Extract pg_filenode.map files from repository
|
||||
//
|
||||
fn add_relmap_files(
|
||||
ar: &mut Builder<&mut dyn Write>,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
snappath: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
for db in timeline.get_databases(lsn)?.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: *db,
|
||||
blknum: 0,
|
||||
};
|
||||
let img = timeline.get_page_at_lsn(tag, lsn)?;
|
||||
//
|
||||
// We flush SLRU segments to the tarball once they are completed.
// This method is used to flush the last (possibly incomplete) segment.
//
|
||||
fn finish_slru_segment(&mut self) -> anyhow::Result<()> {
|
||||
if self.slru_path != "" {
|
||||
// if there is some incomplete segment left over
|
||||
let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
|
||||
let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
|
||||
self.ar.append(&header, &self.slru_buf[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Extract pg_filenode.map files from repository
|
||||
//
|
||||
fn add_relmap_file(&mut self, tag: &ObjectTag, db: &DatabaseTag) -> anyhow::Result<()> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(*tag, self.lsn, false)?;
|
||||
info!("add_relmap_file {:?}", db);
|
||||
let path = if db.spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
String::from("global/pg_filenode.map")
|
||||
String::from("global/pg_filenode.map") // filenode map for global tablespace
|
||||
} else {
|
||||
// User defined tablespaces are not supported
|
||||
assert!(db.spcnode == pg_constants::DEFAULTTABLESPACE_OID);
|
||||
let src_path = format!("{}/base/1/PG_VERSION", snappath);
|
||||
let src_path = format!("{}/base/1/PG_VERSION", self.snappath);
|
||||
let dst_path = format!("base/{}/PG_VERSION", db.dbnode);
|
||||
ar.append_path_with_name(&src_path, &dst_path)?;
|
||||
self.ar.append_path_with_name(&src_path, &dst_path)?;
|
||||
format!("base/{}/pg_filenode.map", db.dbnode)
|
||||
};
|
||||
assert!(img.len() == 512);
|
||||
let header = new_tar_header(&path, img.len() as u64)?;
|
||||
ar.append(&header, &img[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_files(
|
||||
ar: &mut Builder<&mut dyn Write>,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
for xid in timeline.get_twophase(lsn)?.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_TWOPHASE_FORKNUM,
|
||||
},
|
||||
blknum: *xid,
|
||||
};
|
||||
let img = timeline.get_page_at_lsn(tag, lsn)?;
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
let crc = crc32c::crc32c(&img[..]);
|
||||
buf.put_u32_le(crc);
|
||||
let path = format!("pg_twophase/{:>08X}", xid);
|
||||
let header = new_tar_header(&path, buf.len() as u64)?;
|
||||
ar.append(&header, &buf[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Add generated pg_control file
|
||||
//
|
||||
fn add_pgcontrol_file(
|
||||
ar: &mut Builder<&mut dyn Write>,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(checkpoint_bytes) =
|
||||
timeline.get_page_image(BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM), Lsn(0))?
|
||||
{
|
||||
if let Some(pg_control_bytes) = timeline.get_page_image(
|
||||
BufferTag::fork(pg_constants::PG_CONTROLFILE_FORKNUM),
|
||||
Lsn(0),
|
||||
)? {
|
||||
let mut pg_control = postgres_ffi::decode_pg_control(pg_control_bytes)?;
|
||||
let mut checkpoint = postgres_ffi::decode_checkpoint(checkpoint_bytes)?;
|
||||
|
||||
checkpoint.redo = lsn.0;
|
||||
checkpoint.nextXid.value += 1;
|
||||
// TODO: When we restart master there are no active transaction and oldestXid is
|
||||
// equal to nextXid if there are no prepared transactions.
|
||||
// Let's ignore them for a while...
|
||||
checkpoint.oldestXid = checkpoint.nextXid.value as u32;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
let pg_control_bytes = postgres_ffi::encode_pg_control(pg_control);
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
ar.append(&header, &pg_control_bytes[..])?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Generate tarball with non-relational files from repository
|
||||
///
|
||||
pub fn send_tarball_at_lsn(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
timeline: &Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
snapshot_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut ar = Builder::new(write);
|
||||
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshot_lsn.0);
|
||||
|
||||
debug!("sending tarball of snapshot in {}", snappath);
|
||||
for entry in WalkDir::new(&snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
// Shared catalogs are exempt
|
||||
if relpath.starts_with("global/") {
|
||||
trace!("sending shared catalog {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
if entry.file_name() != "pg_filenode.map"
|
||||
&& entry.file_name() != "pg_control"
|
||||
&& !relpath.starts_with("pg_xact/")
|
||||
&& !relpath.starts_with("pg_multixact/")
|
||||
{
|
||||
trace!("sending {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
trace!("not sending {}", relpath.display());
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
}
|
||||
self.ar.append(&header, &img[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
add_slru_segments(
|
||||
&mut ar,
|
||||
timeline,
|
||||
"pg_xact",
|
||||
pg_constants::PG_XACT_FORKNUM,
|
||||
lsn,
|
||||
)?;
|
||||
add_slru_segments(
|
||||
&mut ar,
|
||||
timeline,
|
||||
"pg_multixact/members",
|
||||
pg_constants::PG_MXACT_MEMBERS_FORKNUM,
|
||||
lsn,
|
||||
)?;
|
||||
add_slru_segments(
|
||||
&mut ar,
|
||||
timeline,
|
||||
"pg_multixact/offsets",
|
||||
pg_constants::PG_MXACT_OFFSETS_FORKNUM,
|
||||
lsn,
|
||||
)?;
|
||||
add_relmap_files(&mut ar, timeline, lsn, &snappath)?;
|
||||
add_twophase_files(&mut ar, timeline, lsn)?;
|
||||
add_pgcontrol_file(&mut ar, timeline, lsn)?;
|
||||
|
||||
ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Send a tarball containing a snapshot of all non-relation files in the
|
||||
/// PostgreSQL data directory, at given LSN
|
||||
///
|
||||
/// There must be a snapshot at the given LSN in the snapshots directory, we cannot
|
||||
/// reconstruct the state at an arbitrary LSN at the moment.
|
||||
///
|
||||
pub fn send_snapshot_tarball(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
snapshotlsn: Lsn,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut ar = Builder::new(write);
|
||||
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn.0);
|
||||
let walpath = format!("timelines/{}/wal", timelineid);
|
||||
|
||||
debug!("sending tarball of snapshot in {}", snappath);
|
||||
//ar.append_dir_all("", &snappath)?;
|
||||
|
||||
for entry in WalkDir::new(&snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
// Shared catalogs are exempt
|
||||
if relpath.starts_with("global/") {
|
||||
trace!("sending shared catalog {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
trace!("sending {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else {
|
||||
trace!("not sending {}", relpath.display());
|
||||
|
||||
// FIXME: For now, also send all the relation files.
|
||||
// This really shouldn't be necessary, and kind of
|
||||
// defeats the point of having a page server in the
|
||||
// first place. But it is useful at least when
|
||||
// debugging with the DEBUG_COMPARE_LOCAL option (see
|
||||
// vendor/postgres/src/backend/storage/smgr/pagestore_smgr.c)
|
||||
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
//
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_file(&mut self, tag: &ObjectTag, xid: TransactionId) -> anyhow::Result<()> {
|
||||
// Include in the tarball only the two-phase files of in-progress transactions
|
||||
if self.timeline.get_tx_status(xid, self.lsn)?
|
||||
== pg_constants::TRANSACTION_STATUS_IN_PROGRESS
|
||||
{
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(*tag, self.lsn, false)?;
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
let crc = crc32c::crc32c(&img[..]);
|
||||
buf.put_u32_le(crc);
|
||||
let path = format!("pg_twophase/{:>08X}", xid);
|
||||
let header = new_tar_header(&path, buf.len() as u64)?;
|
||||
self.ar.append(&header, &buf[..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// FIXME: Also send all the WAL. The compute node would only need
|
||||
// the WAL that applies to non-relation files, because the page
|
||||
// server handles all the relation files. But we don't have a
|
||||
// mechanism for separating relation and non-relation WAL at the
|
||||
// moment.
|
||||
for entry in std::fs::read_dir(&walpath)? {
|
||||
let entry = entry?;
|
||||
let fullpath = &entry.path();
|
||||
let relpath = fullpath.strip_prefix(&walpath).unwrap();
|
||||
//
|
||||
// Add generated pg_control file
|
||||
//
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(ObjectTag::Checkpoint, self.lsn, false)?;
|
||||
let pg_control_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(ObjectTag::ControlFile, self.lsn, false)?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
if !entry.path().is_file() {
|
||||
continue;
|
||||
}
|
||||
// Generate new pg_control and WAL needed for bootstrap
|
||||
let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let checkpoint_lsn = XLogSegNoOffsetToRecPtr(
|
||||
checkpoint_segno,
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32);
|
||||
|
||||
let archive_fname = relpath.to_str().unwrap();
|
||||
let archive_fname = archive_fname
|
||||
.strip_suffix(".partial")
|
||||
.unwrap_or(&archive_fname);
|
||||
let archive_path = "pg_wal/".to_owned() + archive_fname;
|
||||
ar.append_path_with_name(fullpath, archive_path)?;
|
||||
//reset some fields we don't want to preserve
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = checkpoint_lsn;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
info!("pg_control.state = {}", pg_control.state);
|
||||
pg_control.state = pg_constants::DB_SHUTDOWNED;
|
||||
|
||||
// add zenith.signal file
|
||||
self.ar.append(
|
||||
&new_tar_header("zenith.signal", 8)?,
|
||||
&self.prev_record_lsn.0.to_le_bytes()[..],
|
||||
)?;
|
||||
|
||||
//send pg_control
|
||||
let pg_control_bytes = pg_control.encode();
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
self.ar.append(&header, &pg_control_bytes[..])?;
|
||||
|
||||
//send wal segment
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
checkpoint_segno,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(&pg_control);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -385,7 +310,7 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
} else if path.strip_prefix("pg_tblspc/").is_some() {
|
||||
// TODO
|
||||
error!("tablespaces not implemented yet");
|
||||
Err(FilePathError::InvalidFileName)
|
||||
@@ -394,6 +319,28 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Check if it is relational file
|
||||
//
|
||||
fn is_rel_file_path(path: &str) -> bool {
|
||||
parse_rel_file_path(path).is_ok()
|
||||
}
|
||||
|
||||
//
|
||||
// Create new tarball entry header
|
||||
//
|
||||
fn new_tar_header(path: &str, size: u64) -> anyhow::Result<Header> {
|
||||
let mut header = Header::new_gnu();
|
||||
header.set_size(size);
|
||||
header.set_path(path)?;
|
||||
header.set_mode(0b110000000); // -rw-------
|
||||
header.set_mtime(
|
||||
// use current time as last modified time
|
||||
SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs(),
|
||||
);
|
||||
header.set_cksum();
|
||||
Ok(header)
|
||||
}
|
||||
|
||||
@@ -3,29 +3,116 @@
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use parse_duration::parse;
|
||||
use std::io;
|
||||
use std::process::exit;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{env, path::PathBuf};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
env,
|
||||
fs::{File, OpenOptions},
|
||||
io,
|
||||
net::TcpListener,
|
||||
path::{Path, PathBuf},
|
||||
process::exit,
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{App, Arg};
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use slog::{Drain, FnValue};
|
||||
|
||||
use pageserver::{branches, page_cache, page_service, tui, PageServerConf};
|
||||
|
||||
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
|
||||
|
||||
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
const DEFAULT_GC_PERIOD_SEC: u64 = 10;
|
||||
//const DEFAULT_GC_HORIZON: u64 = 1024 * 1024 * 1024;
|
||||
//const DEFAULT_GC_PERIOD_SEC: u64 = 600;
|
||||
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
|
||||
|
||||
const DEFAULT_WAL_REDOERS: usize = 1;
|
||||
|
||||
/// String arguments that can be declared via CLI or config file
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct CfgFileParams {
|
||||
listen_addr: Option<String>,
|
||||
gc_horizon: Option<String>,
|
||||
gc_period: Option<String>,
|
||||
wal_redoers: Option<String>,
|
||||
pg_distrib_dir: Option<String>,
|
||||
}
|
||||
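// For illustration only: a pageserver.toml along these lines would populate the
// fields above (all values are made-up examples, not recommended settings):
//
//     listen_addr = "127.0.0.1:64000"
//     gc_horizon = "67108864"
//     gc_period = "10s"
//     wal_redoers = "1"
//     pg_distrib_dir = "./tmp_install"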
|
||||
impl CfgFileParams {
|
||||
/// Extract string arguments from CLI
|
||||
fn from_args(arg_matches: &ArgMatches) -> Self {
|
||||
let get_arg = |arg_name: &str| -> Option<String> {
|
||||
arg_matches.value_of(arg_name).map(str::to_owned)
|
||||
};
|
||||
|
||||
Self {
|
||||
listen_addr: get_arg("listen"),
|
||||
gc_horizon: get_arg("gc_horizon"),
|
||||
gc_period: get_arg("gc_period"),
|
||||
wal_redoers: get_arg("wal_redoers"),
|
||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Fill missing values in `self` with `other`
|
||||
fn or(self, other: CfgFileParams) -> Self {
|
||||
// TODO cleaner way to do this
|
||||
Self {
|
||||
listen_addr: self.listen_addr.or(other.listen_addr),
|
||||
gc_horizon: self.gc_horizon.or(other.gc_horizon),
|
||||
gc_period: self.gc_period.or(other.gc_period),
|
||||
wal_redoers: self.wal_redoers.or(other.wal_redoers),
|
||||
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a PageServerConf from these string parameters
|
||||
fn try_into_config(&self) -> Result<PageServerConf> {
|
||||
let listen_addr = match self.listen_addr.as_ref() {
|
||||
Some(addr) => addr.clone(),
|
||||
None => DEFAULT_LISTEN_ADDR.to_owned(),
|
||||
};
|
||||
|
||||
let gc_horizon: u64 = match self.gc_horizon.as_ref() {
|
||||
Some(horizon_str) => horizon_str.parse()?,
|
||||
None => DEFAULT_GC_HORIZON,
|
||||
};
|
||||
let gc_period = match self.gc_period.as_ref() {
|
||||
Some(period_str) => humantime::parse_duration(period_str)?,
|
||||
None => DEFAULT_GC_PERIOD,
|
||||
};
|
||||
|
||||
let wal_redoers = match self.wal_redoers.as_ref() {
|
||||
Some(wal_redoers_str) => wal_redoers_str.parse::<usize>()?,
|
||||
None => DEFAULT_WAL_REDOERS,
|
||||
};
|
||||
|
||||
let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
|
||||
Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
|
||||
None => env::current_dir()?.join("tmp_install"),
|
||||
};
|
||||
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
Ok(PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
materialize: false,
|
||||
|
||||
listen_addr,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
wal_redoers,
|
||||
workdir: PathBuf::from("."),
|
||||
|
||||
pg_distrib_dir,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
@@ -44,6 +131,12 @@ fn main() -> Result<()> {
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("materialize")
|
||||
.long("materialize")
|
||||
.takes_value(false)
|
||||
.help("Materialize pages constructed by get_page_at"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
@@ -69,6 +162,12 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("wal_redoers")
|
||||
.long("wal_redoers")
|
||||
.takes_value(true)
|
||||
.help("Number of wal-redo postgres instances"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("workdir")
|
||||
.short("D")
|
||||
@@ -76,47 +175,40 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Working directory for the pageserver"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("postgres-distrib")
|
||||
.long("postgres-distrib")
|
||||
.takes_value(true)
|
||||
.help("Postgres distribution directory"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let workdir = if let Some(workdir_arg) = arg_matches.value_of("workdir") {
|
||||
PathBuf::from(workdir_arg)
|
||||
} else if let Some(workdir_arg) = std::env::var_os("ZENITH_REPO_DIR") {
|
||||
PathBuf::from(workdir_arg.to_str().unwrap())
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
let cfg_file_path = workdir.canonicalize()?.join("pageserver.toml");
|
||||
|
||||
let args_params = CfgFileParams::from_args(&arg_matches);
|
||||
|
||||
let init = arg_matches.is_present("init");
|
||||
let params = if init {
|
||||
// We're initializing the repo, so there's no config file yet
|
||||
args_params
|
||||
} else {
|
||||
PathBuf::from(".zenith")
|
||||
// Supplement the CLI arguments with the config file
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)?;
|
||||
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents)?;
|
||||
args_params.or(file_params)
|
||||
};
|
||||
|
||||
let pg_distrib_dir: PathBuf = {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
postgres_bin.into()
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
cwd.join("tmp_install")
|
||||
}
|
||||
};
|
||||
// Ensure the config is valid, even if just init-ing
|
||||
let mut conf = params.try_into_config()?;
|
||||
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
conf.daemonize = arg_matches.is_present("daemonize");
|
||||
conf.interactive = arg_matches.is_present("interactive");
|
||||
conf.materialize = arg_matches.is_present("materialize");
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
|
||||
listen_addr: "127.0.0.1:64000".parse().unwrap(),
|
||||
// we will change the current working directory to the repository below,
|
||||
// so always set 'workdir' to '.'
|
||||
workdir: PathBuf::from("."),
|
||||
pg_distrib_dir,
|
||||
};
|
||||
|
||||
if arg_matches.is_present("daemonize") {
|
||||
conf.daemonize = true;
|
||||
}
|
||||
|
||||
if arg_matches.is_present("interactive") {
|
||||
conf.interactive = true;
|
||||
if init && (conf.daemonize || conf.interactive) {
|
||||
eprintln!("--daemonize and --interactive may not be used with --init");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if conf.daemonize && conf.interactive {
|
||||
@@ -124,26 +216,19 @@ fn main() -> Result<()> {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("listen") {
|
||||
conf.listen_addr = addr.parse()?;
|
||||
}
|
||||
|
||||
if let Some(horizon) = arg_matches.value_of("gc_horizon") {
|
||||
conf.gc_horizon = horizon.parse()?;
|
||||
}
|
||||
|
||||
if let Some(period) = arg_matches.value_of("gc_period") {
|
||||
conf.gc_period = parse(period)?;
|
||||
}
|
||||
|
||||
// The configuration is all set up now. Turn it into a 'static
|
||||
// that can be freely stored in structs and passed across threads
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if arg_matches.is_present("init") {
|
||||
if init {
|
||||
branches::init_repo(conf, &workdir)?;
|
||||
|
||||
// write the config file
|
||||
let cfg_file_contents = toml::to_string_pretty(¶ms)?;
|
||||
std::fs::write(&cfg_file_path, cfg_file_contents)?;
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -207,9 +292,10 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
// Check that we can bind to address before starting threads to simplify shutdown
|
||||
// sequence if port is occupied.
|
||||
info!("Starting pageserver on {}", conf.listen_addr);
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_addr)?;
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_addr.clone())?;
|
||||
|
||||
// Initialize page cache, this will spawn walredo_thread
|
||||
page_cache::init(conf);
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
//
|
||||
// Branch management code
|
||||
//
|
||||
//!
|
||||
//! Branch management code
|
||||
//!
|
||||
// TODO: move all paths construction to conf impl
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use fs::File;
|
||||
use fs_extra;
|
||||
use postgres_ffi::{pg_constants, xlog_utils};
|
||||
use postgres_ffi::{pg_constants, xlog_utils, ControlFileData};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
@@ -22,6 +20,8 @@ use std::{
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::restore_local_repo;
|
||||
use crate::{repository::Repository, PageServerConf, ZTimelineId};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -39,7 +39,7 @@ pub struct PointInTime {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
pub fn init_repo(conf: &PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
fs::create_dir_all(repo_dir)
|
||||
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
|
||||
@@ -50,15 +50,9 @@ pub fn init_repo(conf: &PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
fs::create_dir(std::path::Path::new("refs"))?;
|
||||
fs::create_dir(std::path::Path::new("refs").join("branches"))?;
|
||||
fs::create_dir(std::path::Path::new("refs").join("tags"))?;
|
||||
fs::create_dir(std::path::Path::new("wal-redo"))?;
|
||||
|
||||
println!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
// Create initial timeline
|
||||
let tli = create_timeline(conf, None)?;
|
||||
let timelinedir = conf.timeline_path(tli);
|
||||
println!("created initial timeline {}", tli);
|
||||
|
||||
// Run initdb
|
||||
//
|
||||
// We create the cluster temporarily in a "tmp" directory inside the repository,
|
||||
@@ -79,17 +73,45 @@ pub fn init_repo(conf: &PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
.output()
|
||||
.with_context(|| "failed to execute initdb")?;
|
||||
if !initdb_otput.status.success() {
|
||||
anyhow::bail!("initdb failed");
|
||||
anyhow::bail!(
|
||||
"initdb failed: '{}'",
|
||||
String::from_utf8_lossy(&initdb_otput.stderr)
|
||||
);
|
||||
}
|
||||
println!("initdb succeeded");
|
||||
|
||||
// Read control file to extract the LSN and system id
|
||||
let controlfile_path = tmppath.join("global").join("pg_control");
|
||||
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
|
||||
let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
|
||||
// let systemid = controlfile.system_identifier;
|
||||
let lsn = controlfile.checkPoint;
|
||||
let lsnstr = format!("{:016X}", lsn);
|
||||
|
||||
// Bootstrap the repository by loading the newly-initdb'd cluster into 'main' branch.
|
||||
let tli = create_timeline(conf, None)?;
|
||||
let timelinedir = conf.timeline_path(tli);
|
||||
|
||||
// We don't use page_cache here, because we don't want to spawn the WAL redo thread during
|
||||
// repository initialization.
|
||||
//
|
||||
// FIXME: That caused trouble, because the WAL redo thread launched initdb in the background,
|
||||
// and it kept running even after the "zenith init" had exited. In tests, we started the
|
||||
// page server immediately after that, so that initdb was still running in the background,
|
||||
// and we failed to run initdb again in the same directory. This has been solved for the
|
||||
// rapid init+start case now, but the general race condition remains if you restart the
|
||||
// server quickly.
|
||||
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;
|
||||
//let storage = crate::inmem_storage::InmemObjectStore::create(conf)?;
|
||||
|
||||
let repo = crate::object_repository::ObjectRepository::new(
|
||||
conf,
|
||||
std::sync::Arc::new(storage),
|
||||
std::sync::Arc::new(crate::walredo::DummyRedoManager {}),
|
||||
);
|
||||
let timeline = repo.create_empty_timeline(tli, Lsn(lsn))?;
|
||||
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(&tmppath, &*timeline, Lsn(lsn))?;
|
||||
|
||||
// Move the initial WAL file
|
||||
fs::rename(
|
||||
tmppath.join("pg_wal").join("000000010000000000000001"),
|
||||
@@ -97,19 +119,21 @@ pub fn init_repo(conf: &PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
.join("wal")
|
||||
.join("000000010000000000000001.partial"),
|
||||
)?;
|
||||
println!("moved initial WAL file");
|
||||
println!("created initial timeline {}", tli);
|
||||
|
||||
let data = tli.to_string();
|
||||
fs::write(conf.branch_path("main"), data)?;
|
||||
println!("created main branch");
|
||||
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all(tmppath.join("pg_wal"))?;
|
||||
|
||||
// Move the data directory as an initial base backup.
|
||||
// FIXME: It would be enough to only copy the non-relation files here, the relation
|
||||
// data was already loaded into the repository.
|
||||
let target = timelinedir.join("snapshots").join(&lsnstr);
|
||||
fs::rename(tmppath, &target)?;
|
||||
|
||||
// Create 'main' branch to refer to the initial timeline
|
||||
let data = tli.to_string();
|
||||
fs::write(conf.branch_path("main"), data)?;
|
||||
println!("created main branch");
|
||||
|
||||
println!(
|
||||
"new zenith repository was created in {}",
|
||||
repo_dir.display()
|
||||
@@ -118,10 +142,9 @@ pub fn init_repo(conf: &PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_branches(
|
||||
conf: &PageServerConf,
|
||||
repository: &dyn Repository,
|
||||
) -> Result<Vec<BranchInfo>> {
|
||||
pub(crate) fn get_branches(conf: &PageServerConf) -> Result<Vec<BranchInfo>> {
|
||||
let repo = page_cache::get_repository();
|
||||
|
||||
// Each branch has a corresponding record (text file) in the refs/branches
|
||||
// with timeline_id.
|
||||
let branches_dir = std::path::Path::new("refs").join("branches");
|
||||
@@ -132,7 +155,7 @@ pub(crate) fn get_branches(
|
||||
let name = dir_entry.file_name().to_str().unwrap().to_string();
|
||||
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
|
||||
|
||||
let latest_valid_lsn = repository
|
||||
let latest_valid_lsn = repo
|
||||
.get_timeline(timeline_id)
|
||||
.map(|timeline| timeline.get_last_valid_lsn())
|
||||
.ok();
|
||||
@@ -189,7 +212,7 @@ pub(crate) fn get_system_id(conf: &PageServerConf) -> Result<u64> {
|
||||
|
||||
let (_, main_snap_dir) = find_latest_snapshot(conf, *main_tli)?;
|
||||
let controlfile_path = main_snap_dir.join("global").join("pg_control");
|
||||
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
|
||||
let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
|
||||
Ok(controlfile.system_identifier)
|
||||
}
|
||||
|
||||
@@ -198,6 +221,8 @@ pub(crate) fn create_branch(
|
||||
branchname: &str,
|
||||
startpoint_str: &str,
|
||||
) -> Result<BranchInfo> {
|
||||
let repo = page_cache::get_repository();
|
||||
|
||||
if conf.branch_path(&branchname).exists() {
|
||||
anyhow::bail!("branch {} already exists", branchname);
|
||||
}
|
||||
@@ -206,17 +231,19 @@ pub(crate) fn create_branch(
|
||||
|
||||
if startpoint.lsn == Lsn(0) {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = find_end_of_wal(conf, startpoint.timelineid)?;
|
||||
let end_of_wal = repo
|
||||
.get_timeline(startpoint.timelineid)?
|
||||
.get_last_record_lsn();
|
||||
println!("branching at end of WAL: {}", end_of_wal);
|
||||
startpoint.lsn = end_of_wal;
|
||||
}
|
||||
|
||||
// create a new timeline for it
|
||||
// create a new timeline directory for it
|
||||
let newtli = create_timeline(conf, Some(startpoint))?;
|
||||
let newtimelinedir = conf.timeline_path(newtli);
|
||||
|
||||
let data = newtli.to_string();
|
||||
fs::write(conf.branch_path(&branchname), data)?;
|
||||
// Let the Repository backend do its initialization
|
||||
repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
|
||||
|
||||
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
|
||||
// TODO: be smarter and avoid the copying...
|
||||
@@ -232,6 +259,12 @@ pub(crate) fn create_branch(
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
)?;
|
||||
|
||||
// Remember the human-readable branch name for the new timeline.
|
||||
// FIXME: there's a race condition, if you create a branch with the same
|
||||
// name concurrently.
|
||||
let data = newtli.to_string();
|
||||
fs::write(conf.branch_path(&branchname), data)?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
timeline_id: newtli,
|
||||
@@ -340,52 +373,43 @@ fn copy_wal(src_dir: &Path, dst_dir: &Path, upto: Lsn, wal_seg_size: usize) -> R
|
||||
let last_segno = upto.segment_number(wal_seg_size);
|
||||
let last_segoff = upto.segment_offset(wal_seg_size);
|
||||
|
||||
for entry in fs::read_dir(src_dir).unwrap() {
|
||||
if let Ok(entry) = entry {
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
for entry in fs::read_dir(src_dir).unwrap().flatten() {
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
|
||||
// Check if the filename looks like an xlog file, or a .partial file.
|
||||
if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
|
||||
continue;
|
||||
}
|
||||
let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size as usize);
|
||||
// Check if the filename looks like an xlog file, or a .partial file.
|
||||
if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
|
||||
continue;
|
||||
}
|
||||
let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size as usize);
|
||||
|
||||
let copylen;
|
||||
let mut dst_fname = PathBuf::from(fname);
|
||||
if segno > last_segno {
|
||||
// future segment, skip
|
||||
continue;
|
||||
} else if segno < last_segno {
|
||||
copylen = wal_seg_size;
|
||||
dst_fname.set_extension("");
|
||||
} else {
|
||||
copylen = last_segoff;
|
||||
dst_fname.set_extension("partial");
|
||||
}
|
||||
let copylen;
|
||||
let mut dst_fname = PathBuf::from(fname);
|
||||
if segno > last_segno {
|
||||
// future segment, skip
|
||||
continue;
|
||||
} else if segno < last_segno {
|
||||
copylen = wal_seg_size;
|
||||
dst_fname.set_extension("");
|
||||
} else {
|
||||
copylen = last_segoff;
|
||||
dst_fname.set_extension("partial");
|
||||
}
|
||||
|
||||
let src_file = File::open(entry.path())?;
|
||||
let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
|
||||
std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;
|
||||
let src_file = File::open(entry.path())?;
|
||||
let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
|
||||
std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;
|
||||
|
||||
if copylen < wal_seg_size {
|
||||
std::io::copy(
|
||||
&mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
|
||||
&mut dst_file,
|
||||
)?;
|
||||
}
|
||||
if copylen < wal_seg_size {
|
||||
std::io::copy(
|
||||
&mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
|
||||
&mut dst_file,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
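The loop above relies on Lsn::segment_number and Lsn::segment_offset to decide which segments to copy whole and where to cut the last one. Here is a small sketch of that arithmetic (not part of the patch; it only uses the helpers copy_wal already calls and the standard 16 MiB WAL segment size):

use zenith_utils::lsn::Lsn;

fn segment_arithmetic_sketch() {
    let wal_seg_size: usize = 16 * 1024 * 1024; // pg_constants::WAL_SEGMENT_SIZE

    // An LSN a little past the start of the third 16 MiB segment.
    let upto = Lsn(0x0200_0123);

    // 0x0200_0123 / 0x0100_0000 == 2: segments 0 and 1 are copied whole,
    // segment 2 becomes the ".partial" file.
    assert_eq!(upto.segment_number(wal_seg_size), 2);

    // 0x0200_0123 % 0x0100_0000 == 0x123: only the first 0x123 bytes of the
    // partial segment hold real WAL; copy_wal() zero-pads the rest.
    assert_eq!(upto.segment_offset(wal_seg_size), 0x123);
}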
|
||||
|
||||
// Find the end of valid WAL in a wal directory
|
||||
pub fn find_end_of_wal(conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
|
||||
let waldir = conf.timeline_path(timeline).join("wal");
|
||||
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, pg_constants::WAL_SEGMENT_SIZE, true);
|
||||
Ok(Lsn(lsn))
|
||||
}
|
||||
|
||||
// Find the latest snapshot for a timeline
|
||||
fn find_latest_snapshot(conf: &PageServerConf, timeline: ZTimelineId) -> Result<(Lsn, PathBuf)> {
|
||||
let snapshotsdir = conf.snapshots_path(timeline);
|
||||
|
||||
345
pageserver/src/inmem_storage.rs
Normal file
@@ -0,0 +1,345 @@
|
||||
//!
|
||||
//! An implementation of the ObjectStore interface, backed by BTreeMap
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::object_store::ObjectStore;
|
||||
use crate::repository::RelTag;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::ops::Bound::*;
|
||||
use std::sync::RwLock;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize)]
|
||||
pub struct StorageKey {
|
||||
obj_key: ObjectKey,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl StorageKey {
|
||||
/// The first key for a given timeline
|
||||
fn timeline_start(timeline: ZTimelineId) -> Self {
|
||||
Self {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
tag: ObjectTag::FirstTag,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct InmemObjectStore {
|
||||
conf: &'static PageServerConf,
|
||||
db: RwLock<BTreeMap<StorageKey, Vec<u8>>>,
|
||||
}
|
||||
|
||||
impl ObjectStore for InmemObjectStore {
|
||||
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
|
||||
let db = self.db.read().unwrap();
|
||||
let val = db.get(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
});
|
||||
if let Some(val) = val {
|
||||
Ok(val.clone())
|
||||
} else {
|
||||
bail!("could not find page {:?}", key);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
|
||||
let search_key = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let db = self.db.read().unwrap();
|
||||
for pair in db.range(&search_key..) {
|
||||
let key = pair.0;
|
||||
return Ok(Some(key.obj_key.clone()));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
|
||||
let mut db = self.db.write().unwrap();
|
||||
db.insert(
|
||||
StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
},
|
||||
value.to_vec(),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
|
||||
let mut db = self.db.write().unwrap();
|
||||
db.remove(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Iterate through page versions of a given page, starting from the given LSN.
|
||||
/// The versions are walked in descending LSN order.
|
||||
fn object_versions<'a>(
|
||||
&'a self,
|
||||
key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
|
||||
let from = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let till = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
};
|
||||
let db = self.db.read().unwrap();
|
||||
let versions: Vec<(Lsn, Vec<u8>)> = db
|
||||
.range(from..=till)
|
||||
.map(|pair| (pair.0.lsn, pair.1.clone()))
|
||||
.collect();
|
||||
Ok(Box::new(InmemObjectVersionIter::new(versions)))
|
||||
}
|
||||
|
||||
/// Iterate through all timeline objects
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
|
||||
let curr_key = StorageKey::timeline_start(timeline);
|
||||
|
||||
Ok(Box::new(InmemObjectIter {
|
||||
store: &self,
|
||||
curr_key,
|
||||
timeline,
|
||||
nonrel_only,
|
||||
lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
///
|
||||
/// TODO: This implementation is very inefficient, it scans
|
||||
/// through all entries in the given database. In practice, this
|
||||
/// is used for CREATE DATABASE, and usually the template database is small.
|
||||
/// But if it's not, this will be slow.
|
||||
fn list_rels(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>> {
|
||||
// FIXME: This scans everything. Very slow
|
||||
|
||||
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||
|
||||
let mut search_rel_tag = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: 0,
|
||||
forknum: 0u8,
|
||||
};
|
||||
let db = self.db.read().unwrap();
|
||||
'outer: loop {
|
||||
let search_key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::RelationMetadata(search_rel_tag),
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
for pair in db.range(&search_key..) {
|
||||
let key = pair.0;
|
||||
|
||||
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
|
||||
if spcnode != 0 && rel_tag.spcnode != spcnode
|
||||
|| dbnode != 0 && rel_tag.dbnode != dbnode
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
if key.lsn <= lsn {
|
||||
// visible in this snapshot
|
||||
rels.insert(rel_tag);
|
||||
}
|
||||
search_rel_tag = rel_tag;
|
||||
// skip to next relation
|
||||
// FIXME: What if relnode is u32::MAX ?
|
||||
search_rel_tag.relnode += 1;
|
||||
continue 'outer;
|
||||
} else {
|
||||
// no more relation metadata entries
|
||||
break 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
/// Returns all versions up to and including the specified LSN.
|
||||
fn objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
|
||||
let curr_key = StorageKey::timeline_start(timeline);
|
||||
|
||||
Ok(Box::new(InmemObjects {
|
||||
store: &self,
|
||||
curr_key,
|
||||
timeline,
|
||||
lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
fn compact(&self) {}
|
||||
}
|
||||
|
||||
impl Drop for InmemObjectStore {
|
||||
fn drop(&mut self) {
|
||||
let path = self.conf.workdir.join("objstore.dmp");
|
||||
let mut f = File::create(path).unwrap();
|
||||
f.write(&self.db.ser().unwrap()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl InmemObjectStore {
|
||||
pub fn open(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
|
||||
let path = conf.workdir.join("objstore.dmp");
|
||||
let mut f = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
f.read_to_end(&mut buffer)?;
|
||||
let db = RwLock::new(BTreeMap::des(&buffer)?);
|
||||
Ok(InmemObjectStore { conf: conf, db })
|
||||
}
|
||||
|
||||
pub fn create(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
|
||||
Ok(InmemObjectStore {
|
||||
conf: conf,
|
||||
db: RwLock::new(BTreeMap::new()),
|
||||
})
|
||||
}
|
||||
}
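To show how create(), the Drop impl and open() fit together, here is a hypothetical round-trip sketch (the timeline id and payload are made up; it assumes a &'static PageServerConf whose workdir is writable, as elsewhere in this module):

use crate::inmem_storage::InmemObjectStore;
use crate::object_key::{ObjectKey, ObjectTag};
use crate::object_store::ObjectStore;
use crate::{PageServerConf, ZTimelineId};
use std::str::FromStr;
use zenith_utils::lsn::Lsn;

fn inmem_roundtrip(conf: &'static PageServerConf) -> anyhow::Result<()> {
    let timeline = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
    let key = ObjectKey {
        timeline,
        tag: ObjectTag::TimelineMetadataTag,
    };

    {
        let store = InmemObjectStore::create(conf)?;
        store.put(&key, Lsn(1), b"metadata v1")?;
    } // Drop serializes the whole BTreeMap into <workdir>/objstore.dmp

    // A later process (or a restart) can reload exactly the same contents.
    let store = InmemObjectStore::open(conf)?;
    assert_eq!(store.get(&key, Lsn(1))?, b"metadata v1".to_vec());
    Ok(())
}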
|
||||
|
||||
///
|
||||
/// Iterator for `object_versions`. Returns all page versions of a given block, in
|
||||
/// reverse LSN order.
|
||||
///
|
||||
struct InmemObjectVersionIter {
|
||||
versions: Vec<(Lsn, Vec<u8>)>,
|
||||
curr: usize,
|
||||
}
|
||||
impl InmemObjectVersionIter {
|
||||
fn new(versions: Vec<(Lsn, Vec<u8>)>) -> InmemObjectVersionIter {
|
||||
let curr = versions.len();
|
||||
InmemObjectVersionIter { versions, curr }
|
||||
}
|
||||
}
|
||||
impl Iterator for InmemObjectVersionIter {
|
||||
type Item = (Lsn, Vec<u8>);
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
if self.curr == 0 {
|
||||
None
|
||||
} else {
|
||||
self.curr -= 1;
|
||||
Some(self.versions[self.curr].clone())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct InmemObjects<'r> {
|
||||
store: &'r InmemObjectStore,
|
||||
curr_key: StorageKey,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl<'r> Iterator for InmemObjects<'r> {
|
||||
// TODO consider returning Box<[u8]>
|
||||
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_result().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r> InmemObjects<'r> {
|
||||
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
|
||||
let db = self.store.db.read().unwrap();
|
||||
for pair in db.range((Excluded(&self.curr_key), Unbounded)) {
|
||||
let key = pair.0;
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
return Ok(None);
|
||||
}
|
||||
if key.lsn > self.lsn {
|
||||
// TODO can speed up by seeking iterator
|
||||
continue;
|
||||
}
|
||||
self.curr_key = key.clone();
|
||||
let value = pair.1.clone();
|
||||
return Ok(Some((key.obj_key.tag, key.lsn, value)));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `list_objects`. Returns all objects preceding the specified LSN
|
||||
///
|
||||
struct InmemObjectIter<'a> {
|
||||
store: &'a InmemObjectStore,
|
||||
curr_key: StorageKey,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for InmemObjectIter<'a> {
|
||||
type Item = ObjectTag;
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
let db = self.store.db.read().unwrap();
|
||||
'outer: loop {
|
||||
for pair in db.range((Excluded(&self.curr_key), Unbounded)) {
|
||||
let key = pair.0;
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
return None;
|
||||
}
|
||||
self.curr_key = key.clone();
|
||||
self.curr_key.lsn = Lsn(u64::MAX); // next seek should skip all versions
|
||||
if key.lsn <= self.lsn {
|
||||
// visible in this snapshot
|
||||
if self.nonrel_only {
|
||||
match key.obj_key.tag {
|
||||
ObjectTag::RelationMetadata(_) => return None,
|
||||
ObjectTag::RelationBuffer(_) => return None,
|
||||
_ => return Some(key.obj_key.tag),
|
||||
}
|
||||
} else {
|
||||
return Some(key.obj_key.tag);
|
||||
}
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,17 +1,21 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use std::fmt;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod inmem_storage;
|
||||
pub mod object_key;
|
||||
pub mod object_repository;
|
||||
pub mod object_store;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod rocksdb_storage;
|
||||
pub mod tui;
|
||||
pub mod tui_event;
|
||||
mod tui_logger;
|
||||
@@ -23,9 +27,11 @@ pub mod walredo;
|
||||
pub struct PageServerConf {
|
||||
pub daemonize: bool,
|
||||
pub interactive: bool,
|
||||
pub listen_addr: SocketAddr,
|
||||
pub materialize: bool,
|
||||
pub listen_addr: String,
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
pub wal_redoers: usize,
|
||||
|
||||
// Repository directory, relative to current working directory.
|
||||
// Normally, the page server changes the current working directory
|
||||
@@ -100,7 +106,7 @@ impl PageServerConf {
|
||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||
/// is usually printed out as a hex string.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
pub struct ZTimelineId([u8; 16]);
|
||||
|
||||
impl FromStr for ZTimelineId {
|
||||
|
||||
84
pageserver/src/object_key.rs
Normal file
@@ -0,0 +1,84 @@
|
||||
use crate::repository::{BufferTag, RelTag};
|
||||
use crate::waldecoder::TransactionId;
|
||||
use crate::ZTimelineId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
///
|
||||
/// ObjectKey is the key type used to identify objects stored in an object
|
||||
/// repository. It is shared between object_repository.rs and object_store.rs.
|
||||
/// It is mostly opaque to the ObjectStore; it just stores and retrieves objects
|
||||
/// using the key given by the caller.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct ObjectKey {
|
||||
pub timeline: ZTimelineId,
|
||||
pub tag: ObjectTag,
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
|
||||
/// in Postgres are handled by SLRU (Simple LRU) buffer, hence the name.
|
||||
///
|
||||
/// These files are global for a postgres instance.
|
||||
///
|
||||
/// These files are divided into segments, which are divided into pages
|
||||
/// of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct SlruBufferTag {
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
///
|
||||
/// Special type of Postgres files: pg_filenode.map is needed to map
|
||||
/// catalog table OIDs to filenode numbers, which define filename.
|
||||
///
|
||||
/// Each database has a map file for its local mapped catalogs,
|
||||
/// and there is a separate map file for shared catalogs.
|
||||
///
|
||||
/// These files have an atypical size of 512 bytes.
|
||||
///
|
||||
/// See PostgreSQL relmapper.c for details.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct DatabaseTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation files that keep state for prepared transactions.
|
||||
/// Unlike other files these are not divided into pages.
|
||||
///
|
||||
/// See PostgreSQL twophase.c for details.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct PrepareTag {
|
||||
pub xid: TransactionId,
|
||||
}
|
||||
|
||||
/// ObjectTag is a part of ObjectKey that is specific to the type of
|
||||
/// the stored object.
|
||||
///
|
||||
/// NB: the order of the enum values is significant! In particular,
|
||||
/// rocksdb_storage.rs assumes that TimelineMetadataTag is first
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum ObjectTag {
|
||||
// dummy tag preceding all other keys
|
||||
FirstTag,
|
||||
TimelineMetadataTag,
|
||||
// Special entry that represents PostgreSQL checkpoint.
|
||||
// We use it to track fields needed to restore controlfile checkpoint.
|
||||
Checkpoint,
|
||||
// Various types of non-relation files.
|
||||
// We need them to bootstrap compute node.
|
||||
ControlFile,
|
||||
Clog(SlruBufferTag),
|
||||
MultiXactMembers(SlruBufferTag),
|
||||
MultiXactOffsets(SlruBufferTag),
|
||||
FileNodeMap(DatabaseTag),
|
||||
TwoPhase(PrepareTag),
|
||||
// put relations at the end of enum to allow efficient iterations through non-rel objects
|
||||
RelationMetadata(RelTag),
|
||||
RelationBuffer(BufferTag),
|
||||
}
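To make the tag layout concrete, here is a small illustration (not part of the patch; the OIDs, block numbers and timeline id are arbitrary) of how a relation page and a CLOG page are addressed:

use crate::object_key::{ObjectKey, ObjectTag, SlruBufferTag};
use crate::repository::{BufferTag, RelTag};
use crate::ZTimelineId;
use std::str::FromStr;

fn example_keys() -> (ObjectKey, ObjectKey) {
    let timeline = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();

    // Block 7 of an ordinary user relation (main fork).
    let page_key = ObjectKey {
        timeline,
        tag: ObjectTag::RelationBuffer(BufferTag {
            rel: RelTag {
                spcnode: 1663, // pg_default tablespace
                dbnode: 111,
                relnode: 1000,
                forknum: 0,
            },
            blknum: 7,
        }),
    };

    // The CLOG (pg_xact) page that holds commit status for some range of XIDs.
    let clog_key = ObjectKey {
        timeline,
        tag: ObjectTag::Clog(SlruBufferTag { blknum: 0 }),
    };

    (page_key, clog_key)
}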
|
||||
1274
pageserver/src/object_repository.rs
Normal file
File diff suppressed because it is too large
88
pageserver/src/object_store.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
//! Low-level key-value storage abstraction.
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::repository::RelTag;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashSet;
|
||||
use std::iter::Iterator;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
/// Low-level storage abstraction.
///
/// All the data in the repository is stored in a key-value store. This trait
/// abstracts the details of the key-value store.
///
/// A simple key-value store would support just GET and PUT operations with
/// a key, but the upper layer needs slightly more complicated read operations.
///
/// The most frequently used function is 'object_versions'. It is used
/// to look up a page version. It is LSN-aware, in that the caller
/// specifies an LSN, and the function returns all values for that
/// block with the same or older LSN.
///
pub trait ObjectStore: Send + Sync {
|
||||
///
|
||||
/// Store a value with given key.
|
||||
///
|
||||
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()>;
|
||||
|
||||
/// Read entry with the exact given key.
|
||||
///
|
||||
/// This is used for retrieving metadata with a special key that doesn't
|
||||
/// correspond to any real relation.
|
||||
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>>;
|
||||
|
||||
/// Read the smallest key greater than or equal to the specified key
|
||||
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>>;
|
||||
|
||||
/// Iterate through all page versions of one object.
|
||||
///
|
||||
/// Returns all page versions in descending LSN order, along with the LSN
|
||||
/// of each page version.
|
||||
fn object_versions<'a>(
|
||||
&'a self,
|
||||
key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>>;
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
/// Returns all versions up to and including the specified LSN.
|
||||
fn objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>>;
|
||||
|
||||
/// Iterate through all keys with the given tablespace and database ID, and LSN <= 'lsn'.
/// Both dbnode and spcnode can be InvalidId (0), which means: get all relations in the tablespace/cluster.
///
/// This is used to implement 'create database'.
fn list_rels(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>>;
|
||||
|
||||
/// Iterate through object tags. If nonrel_only is set, only non-relational data is iterated.
///
/// This is used to implement GC and to prepare the tarball for new node startup.
/// Returns objects in increasing key-version order.
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timelineid: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
|
||||
|
||||
/// Unlink object (used by GC). This method may actually delete the object or just mark it for deletion.
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()>;

// Compact storage and remove versions marked for deletion
fn compact(&self);
|
||||
}
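As a usage sketch (not part of the patch; it works against any ObjectStore implementation, e.g. the in-memory one above), the common "newest version at or before this LSN" lookup can be written as:

use crate::object_key::ObjectKey;
use crate::object_store::ObjectStore;
use anyhow::Result;
use zenith_utils::lsn::Lsn;

/// Return the newest stored version of `key` with LSN <= `lsn`, if any.
/// `object_versions` walks versions in descending LSN order, so the first
/// item it yields is the newest one that is visible at `lsn`.
fn newest_version_at(
    store: &dyn ObjectStore,
    key: &ObjectKey,
    lsn: Lsn,
) -> Result<Option<(Lsn, Vec<u8>)>> {
    let mut versions = store.object_versions(key, lsn)?;
    Ok(versions.next())
}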
|
||||
@@ -3,30 +3,35 @@
|
||||
//! isn't much here. If we implement multi-tenancy, this will probably be changed into
|
||||
//! a hash map, keyed by the tenant ID.
|
||||
|
||||
use crate::repository::rocksdb::RocksRepository;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::repository::Repository;
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
//use crate::inmem_storage::InmemObjectStore;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use lazy_static::lazy_static;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
lazy_static! {
|
||||
pub static ref REPOSITORY: Mutex<Option<Arc<dyn Repository + Send + Sync>>> = Mutex::new(None);
|
||||
pub static ref REPOSITORY: Mutex<Option<Arc<dyn Repository>>> = Mutex::new(None);
|
||||
}
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
let obj_store = RocksObjectStore::open(conf).unwrap();
|
||||
//let obj_store = InmemObjectStore::open(conf).unwrap();
|
||||
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf);
|
||||
|
||||
// we have already changed current dir to the repository.
|
||||
let repo = RocksRepository::new(conf, Arc::new(walredo_mgr));
|
||||
let repo = ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr));
|
||||
|
||||
*m = Some(Arc::new(repo));
|
||||
}
|
||||
|
||||
pub fn get_repository() -> Arc<dyn Repository + Send + Sync> {
|
||||
pub fn get_repository() -> Arc<dyn Repository> {
|
||||
let o = &REPOSITORY.lock().unwrap();
|
||||
Arc::clone(o.as_ref().unwrap())
|
||||
}
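For context, a hypothetical startup sequence built on this module (the timeline id below is made up) would be:

use crate::page_cache;
use crate::{PageServerConf, ZTimelineId};
use std::str::FromStr;

fn startup_sketch(conf: &'static PageServerConf) -> anyhow::Result<()> {
    // Called once during page server startup, after chdir'ing into the repository.
    page_cache::init(conf);

    // Any thread can then grab an Arc handle to the shared repository.
    let repo = page_cache::get_repository();
    let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
    let _timeline = repo.get_timeline(timelineid)?;
    Ok(())
}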
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,55 +1,77 @@
|
||||
pub mod rocksdb;
|
||||
|
||||
use crate::waldecoder::{DecodedWALRecord, Oid, TransactionId, XlCreateDatabase, XlSmgrTruncate};
|
||||
use crate::object_key::*;
|
||||
use crate::waldecoder::TransactionId;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_ffi::nonrelfile_utils::transaction_id_get_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::iter::Iterator;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// A repository corresponds to one .zenith directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
pub trait Repository {
|
||||
pub trait Repository: Send + Sync {
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
///
|
||||
/// The Timeline is expected to be already "open", i.e. `get_or_restore_timeline`
|
||||
/// should've been called on it earlier already.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
///
|
||||
/// Creates a new Timeline object if it's not "open" already.
|
||||
fn get_or_restore_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
start_lsn: Lsn,
|
||||
) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Create an empty timeline, without loading any data into it from possible on-disk snapshot.
|
||||
///
|
||||
/// For unit tests.
|
||||
#[cfg(test)]
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
//fn get_stats(&self) -> RepositoryStats;
|
||||
}
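A minimal sketch of the branching flow this trait enables (the timeline ids are arbitrary; the same steps appear in the test_branch test further down):

use crate::repository::Repository;
use crate::ZTimelineId;
use std::str::FromStr;
use zenith_utils::lsn::Lsn;

fn branch_sketch(repo: &dyn Repository) -> anyhow::Result<()> {
    let src = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
    let dst = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();

    // Fork `dst` off `src` at LSN 3; the backend copies or links whatever it needs.
    repo.branch_timeline(src, dst, Lsn(3))?;

    // The new timeline can then be opened like any other.
    let _new_tline = repo.get_timeline(dst)?;
    Ok(())
}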
|
||||
|
||||
pub trait Timeline {
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct GcResult {
|
||||
pub n_relations: u64,
|
||||
pub inspected: u64,
|
||||
pub truncated: u64,
|
||||
pub deleted: u64,
|
||||
pub prep_deleted: u64, // 2PC prepare
|
||||
pub slru_deleted: u64, // SLRU (clog, multixact)
|
||||
pub chkp_deleted: u64, // Checkpoints
|
||||
pub dropped: u64,
|
||||
pub elapsed: Duration,
|
||||
}
|
||||
|
||||
pub trait Timeline: Send + Sync {
|
||||
//------------------------------------------------------------------------------
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(&self, tag: BufferTag, lsn: Lsn) -> Result<Bytes>;
|
||||
fn get_page_at_lsn(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn_nowait(&self, tag: ObjectTag, lsn: Lsn, materialize: bool) -> Result<Bytes>;
|
||||
|
||||
/// Get size of relation
|
||||
fn get_relsize(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
|
||||
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
/// Does relation exist?
|
||||
fn get_relsize_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
|
||||
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
/// Get page image at the particular LSN
|
||||
fn get_page_image(&self, tag: BufferTag, lsn: Lsn) -> Result<Option<Bytes>>;
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
|
||||
|
||||
/// Get a list of non-relational objects
|
||||
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
@@ -61,94 +83,26 @@ pub trait Timeline {
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, tag: BufferTag, rec: WALRecord);
|
||||
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()>;
|
||||
|
||||
/// Put raw data
|
||||
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes);
|
||||
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool)
|
||||
-> Result<()>;
|
||||
|
||||
/// Truncate relation
|
||||
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
|
||||
|
||||
/// Create a new database from a template database
|
||||
///
|
||||
/// In PostgreSQL, CREATE DATABASE works by scanning the data directory and
|
||||
/// copying all relation files from the template database. This is the equivalent
|
||||
/// of that.
|
||||
fn put_create_database(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
db_id: Oid,
|
||||
tablespace_id: Oid,
|
||||
src_db_id: Oid,
|
||||
src_tablespace_id: Oid,
|
||||
) -> Result<()>;
|
||||
/// Unlink relation. This method is used for marking dropped relations.
|
||||
fn put_unlink(&self, tag: RelTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Helper function to parse a WAL record and call the above functions for all the
|
||||
/// relations/pages that the record affects.
|
||||
///
|
||||
fn save_decoded_record(
|
||||
&self,
|
||||
decoded: DecodedWALRecord,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
// Figure out which blocks the record applies to, and "put" a separate copy
|
||||
// of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
/// Truncate SLRU segment
|
||||
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
self.put_wal_record(tag, rec);
|
||||
}
|
||||
|
||||
// Handle a few special record types
|
||||
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_SMGR_TRUNCATE
|
||||
{
|
||||
let truncate = XlSmgrTruncate::decode(&decoded);
|
||||
if (truncate.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
|
||||
let rel = RelTag {
|
||||
spcnode: truncate.rnode.spcnode,
|
||||
dbnode: truncate.rnode.dbnode,
|
||||
relnode: truncate.rnode.relnode,
|
||||
forknum: pg_constants::MAIN_FORKNUM,
|
||||
};
|
||||
self.put_truncation(rel, lsn, truncate.blkno)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_DBASE_CREATE
|
||||
{
|
||||
let createdb = XlCreateDatabase::decode(&decoded);
|
||||
self.put_create_database(
|
||||
lsn,
|
||||
createdb.db_id,
|
||||
createdb.tablespace_id,
|
||||
createdb.src_db_id,
|
||||
createdb.src_tablespace_id,
|
||||
)?;
|
||||
}
|
||||
// Now that this record has been handled, let the repository know that
|
||||
// it is up-to-date to this LSN
|
||||
self.advance_last_record_lsn(lsn);
|
||||
Ok(())
|
||||
}
|
||||
// Get the next object tag greater than or equal to the specified one
|
||||
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>>;
|
||||
|
||||
/// Remember the all WAL before the given LSN has been processed.
|
||||
///
|
||||
@@ -167,15 +121,66 @@ pub trait Timeline {
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
|
||||
/// Get range [begin,end) of stored blocks. Used mostly for SMGR pseudorelations
|
||||
/// but can be also applied to normal relations.
|
||||
fn get_range(&self, rel: RelTag, lsn: Lsn) -> Result<(u32, u32)>;
|
||||
// Like `advance_last_record_lsn`, but points to the start position of last record
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
|
||||
/// Get vector of databases (represented using RelTag only dbnode and spcnode fields are used)
|
||||
fn get_databases(&self, lsn: Lsn) -> Result<Vec<RelTag>>;
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
/// Get vector of prepared twophase transactions
|
||||
fn get_twophase(&self, lsn: Lsn) -> Result<Vec<TransactionId>>;
|
||||
/// Events for all relations in the timeline.
|
||||
/// Contains updates from start up to the last valid LSN
|
||||
/// at the time of the history() call. This LSN can be read via the lsn() function.
|
||||
///
|
||||
/// Relation size is increased implicitly and decreased with Truncate updates.
|
||||
// TODO ordering guarantee?
|
||||
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>>;
|
||||
|
||||
/// Perform one garbage collection iteration.
/// Garbage collection is periodically performed by the GC thread,
/// but it can also be explicitly requested through the page server API.
///
/// `horizon` specifies the delta from the last LSN within which all object versions are preserved (the PITR interval).
/// The `compact` parameter is used to force compaction of the storage.
/// Some storage implementations are based on an LSM tree and require periodic merging (compaction).
/// Usually the storage implementation decides itself when compaction should be performed.
/// But for GC tests it may be useful to force compaction right after a GC iteration completes,
/// to make sure that all detected garbage is removed.
/// So right now `compact` is set to true when GC is explicitly requested through the page server API,
/// and is set to false in the GC thread, which repeats GC iterations in an infinite loop.
fn gc_iteration(&self, horizon: u64, compact: bool) -> Result<GcResult>;
|
||||
|
||||
// Check transaction status
|
||||
fn get_tx_status(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<u8> {
|
||||
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
let clog_page = self.get_page_at_lsn(tag, lsn)?;
|
||||
let status = transaction_id_get_status(xid, &clog_page[..]);
|
||||
Ok(status)
|
||||
}
|
||||
}
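As a usage sketch (not part of the patch), an explicit GC pass requested through the page server API would look roughly like this; the horizon would normally come from PageServerConf::gc_horizon:

use crate::repository::{GcResult, Timeline};

/// Run one GC iteration, forcing compaction afterwards so the freed
/// space is actually reclaimed and observable right away.
fn run_gc_once(timeline: &dyn Timeline, horizon: u64) -> anyhow::Result<GcResult> {
    let result = timeline.gc_iteration(horizon, /* compact = */ true)?;
    println!(
        "gc: inspected {} objects, deleted {}, dropped {} relations",
        result.inspected, result.deleted, result.dropped
    );
    Ok(result)
}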
|
||||
|
||||
pub trait History: Iterator<Item = Result<RelationUpdate>> {
|
||||
/// The last_valid_lsn at the time of history() call.
|
||||
fn lsn(&self) -> Lsn;
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct RelationUpdate {
|
||||
pub rel: RelTag,
|
||||
pub lsn: Lsn,
|
||||
pub update: Update,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum Update {
|
||||
Page { blknum: u32, img: Bytes },
|
||||
WALRecord { blknum: u32, rec: WALRecord },
|
||||
Truncate { n_blocks: u32 },
|
||||
Unlink,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -186,7 +191,22 @@ pub struct RepositoryStats {
|
||||
pub num_getpage_requests: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
|
||||
///
|
||||
/// Relation data file segment id throughout the Postgres cluster.
|
||||
///
|
||||
/// Every data file in Postgres is uniquely identified by 4 numbers:
|
||||
/// - relation id / node (`relnode`)
|
||||
/// - database id (`dbnode`)
|
||||
/// - tablespace id (`spcnode`), in short this is a unique id of a separate
|
||||
/// directory to store data files.
|
||||
/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
|
||||
/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
|
||||
///
|
||||
/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
|
||||
/// are used for the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: u32,
|
||||
@@ -195,20 +215,12 @@ pub struct RelTag {
|
||||
}
|
||||
|
||||
impl RelTag {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(self.forknum);
|
||||
buf.put_u32(self.spcnode);
|
||||
buf.put_u32(self.dbnode);
|
||||
buf.put_u32(self.relnode);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> RelTag {
|
||||
RelTag {
|
||||
forknum: buf.get_u8(),
|
||||
spcnode: buf.get_u32(),
|
||||
dbnode: buf.get_u32(),
|
||||
relnode: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
pub const ZEROED: Self = Self {
|
||||
forknum: 0,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
};
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
@@ -229,38 +241,27 @@ impl fmt::Display for RelTag {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
/// This is used as a part of the key inside key-value storage (RocksDB currently).
|
||||
///
|
||||
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct BufferTag {
|
||||
pub rel: RelTag,
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
impl BufferTag {
|
||||
pub fn fork(forknum: u8) -> BufferTag {
|
||||
BufferTag {
|
||||
rel: RelTag {
|
||||
forknum,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
},
|
||||
blknum: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
self.rel.pack(buf);
|
||||
buf.put_u32(self.blknum);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> BufferTag {
|
||||
BufferTag {
|
||||
rel: RelTag::unpack(buf),
|
||||
blknum: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
pub const ZEROED: Self = Self {
|
||||
rel: RelTag::ZEROED,
|
||||
blknum: 0,
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
@@ -300,6 +301,8 @@ impl WALRecord {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::pg_constants;
|
||||
@@ -315,15 +318,21 @@ mod tests {
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
};
|
||||
const TESTREL_B: RelTag = RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1001,
|
||||
forknum: 0,
|
||||
};
|
||||
|
||||
/// Convenience function to create a BufferTag for testing.
|
||||
/// Helps to keeps the tests shorter.
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_BUF(blknum: u32) -> BufferTag {
|
||||
BufferTag {
|
||||
fn TEST_BUF(blknum: u32) -> ObjectTag {
|
||||
ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: TESTREL_A,
|
||||
blknum,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
@@ -344,31 +353,28 @@ mod tests {
|
||||
let conf = PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
materialize: false,
|
||||
gc_horizon: 64 * 1024 * 1024,
|
||||
gc_period: Duration::from_secs(10),
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
workdir: repo_dir.into(),
|
||||
wal_redoers: 1,
|
||||
listen_addr: "127.0.0.1:5430".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: "".into(),
|
||||
};
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let obj_store = RocksObjectStore::create(conf)?;
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
|
||||
let repo = rocksdb::RocksRepository::new(conf, Arc::new(walredo_mgr));
|
||||
let repo = ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr));
|
||||
|
||||
Ok(Box::new(repo))
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation.
|
||||
///
|
||||
/// FIXME: The RocksRepository implementation returns wrong relation size, if
|
||||
/// you make a request with an old LSN. It seems to ignore the requested LSN
|
||||
/// and always return result as of latest LSN. For such cases, the expected
|
||||
/// results below match the current RocksRepository behavior, so that the test
|
||||
/// passes, and the actually correct answers are in comments like
|
||||
/// "// CORRECT: <correct answer>"
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
@@ -377,25 +383,24 @@ mod tests {
|
||||
// Create timeline to work on
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"));
|
||||
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"));
|
||||
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
|
||||
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"), true)?;
|
||||
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"), true)?;
|
||||
|
||||
tline.advance_last_valid_lsn(Lsn(5));
|
||||
|
||||
// rocksdb implementation erroneosly returns 'true' here
|
||||
assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(1))?, true); // CORRECT: false
|
||||
// likewise, it returns wrong size here
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(1))?, 3); // CORRECT: 0 (or error?)
|
||||
// The relation was created at LSN 2, not visible at LSN 1 yet.
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(1))?, false);
|
||||
assert!(tline.get_rel_size(TESTREL_A, Lsn(1)).is_err());
|
||||
|
||||
assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(2))?, true);
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(2))?, 3); // CORRECT: 1
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 3);
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(2))?, true);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(2))?, 1);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(5))?, 3);
|
||||
|
||||
// Check page contents at each LSN
|
||||
assert_eq!(
|
||||
@@ -435,7 +440,7 @@ mod tests {
|
||||
tline.advance_last_valid_lsn(Lsn(6));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(6))?, 2);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(6))?, 2);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(6))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
@@ -446,7 +451,7 @@ mod tests {
|
||||
);
|
||||
|
||||
// should still see the truncated block with older LSN
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 2); // CORRECT: 3
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(5))?, 3);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
@@ -464,7 +469,7 @@ mod tests {
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
|
||||
@@ -472,12 +477,12 @@ mod tests {
|
||||
for i in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
|
||||
lsn += 1;
|
||||
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img);
|
||||
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img, true)?;
|
||||
}
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE + 1
|
||||
);
|
||||
|
||||
@@ -486,7 +491,7 @@ mod tests {
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE
|
||||
);
|
||||
|
||||
@@ -495,28 +500,134 @@ mod tests {
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE - 1
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Test branch creation
|
||||
///
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(4), TEST_IMG("foo blk 0 at 4"), true)?;
|
||||
|
||||
// Create another relation
|
||||
let buftag2 = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: TESTREL_B,
|
||||
blknum: 0,
|
||||
});
|
||||
tline.put_page_image(buftag2, Lsn(2), TEST_IMG("foobar blk 0 at 2"), true)?;
|
||||
|
||||
tline.advance_last_valid_lsn(Lsn(4));
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(3))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
|
||||
newtline.put_page_image(TEST_BUF(0), Lsn(4), TEST_IMG("bar blk 0 at 4"), true)?;
|
||||
newtline.advance_last_valid_lsn(Lsn(4));
|
||||
|
||||
// Check page contents on both branches
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
TEST_IMG("foo blk 0 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
newtline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
TEST_IMG("bar blk 0 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
newtline.get_page_at_lsn(buftag2, Lsn(4))?,
|
||||
TEST_IMG("foobar blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(newtline.get_rel_size(TESTREL_B, Lsn(4))?, 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_history() -> Result<()> {
|
||||
let repo = get_test_repo("test_snapshot")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
let mut snapshot = tline.history()?;
|
||||
assert_eq!(snapshot.lsn(), Lsn(0));
|
||||
assert_eq!(None, snapshot.next().transpose()?);
|
||||
|
||||
// add a page and advance the last valid LSN
|
||||
let rel = TESTREL_A;
|
||||
let tag = TEST_BUF(1);
|
||||
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"), true)?;
|
||||
tline.advance_last_valid_lsn(Lsn(1));
|
||||
let mut snapshot = tline.history()?;
|
||||
assert_eq!(snapshot.lsn(), Lsn(1));
|
||||
let expected_page = RelationUpdate {
|
||||
rel: rel,
|
||||
lsn: Lsn(1),
|
||||
update: Update::Page {
|
||||
blknum: 1,
|
||||
img: TEST_IMG("blk 1 @ lsn 1"),
|
||||
},
|
||||
};
|
||||
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
|
||||
assert_eq!(None, snapshot.next().transpose()?);
|
||||
|
||||
// truncate to zero, but don't advance the last valid LSN
|
||||
tline.put_truncation(rel, Lsn(2), 0)?;
|
||||
let mut snapshot = tline.history()?;
|
||||
assert_eq!(snapshot.lsn(), Lsn(1));
|
||||
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
|
||||
assert_eq!(None, snapshot.next().transpose()?);
|
||||
|
||||
// advance the last valid LSN and the truncation should be observable
|
||||
tline.advance_last_valid_lsn(Lsn(2));
|
||||
let mut snapshot = tline.history()?;
|
||||
assert_eq!(snapshot.lsn(), Lsn(2));
|
||||
|
||||
// TODO ordering not guaranteed by API. But currently it returns the
|
||||
// truncation entry before the block data.
|
||||
let expected_truncate = RelationUpdate {
|
||||
rel: rel,
|
||||
lsn: Lsn(2),
|
||||
update: Update::Truncate { n_blocks: 0 },
|
||||
};
|
||||
assert_eq!(Some(expected_truncate), snapshot.next().transpose()?);
|
||||
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
|
||||
assert_eq!(None, snapshot.next().transpose()?);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager {}
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
tag: ObjectTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for rel {} blk {} to get to {}, with {} and {} records",
|
||||
tag.rel,
|
||||
tag.blknum,
|
||||
"redo for {:?} to get to {}, with {} and {} records",
|
||||
tag,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
|
||||
@@ -1,978 +0,0 @@
|
||||
//
|
||||
// A Repository holds all the different page versions and WAL records
|
||||
//
|
||||
// This implementation uses RocksDB to store WAL records and
|
||||
// full page images, keyed by the RelFileNode, blocknumber, and the
|
||||
// LSN.
|
||||
|
||||
use crate::repository::{BufferTag, RelTag, Repository, Timeline, WALRecord};
|
||||
use crate::restore_local_repo::restore_timeline;
|
||||
use crate::waldecoder::{Oid, TransactionId};
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
// use crate::PageServerConf;
|
||||
// use crate::branches;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use postgres_ffi::nonrelfile_utils::transaction_id_get_status;
|
||||
use postgres_ffi::*;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::thread;
|
||||
use std::time::{Duration, Instant};
|
||||
use zenith_utils::lsn::{AtomicLsn, Lsn};
|
||||
use zenith_utils::seqwait::SeqWait;
|
||||
|
||||
// Timeout when waiting for the WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
static TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
pub struct RocksRepository {
|
||||
conf: &'static PageServerConf,
|
||||
timelines: Mutex<HashMap<ZTimelineId, Arc<RocksTimeline>>>,
|
||||
|
||||
walredo_mgr: Arc<dyn WalRedoManager>,
|
||||
}
|
||||
|
||||
pub struct RocksTimeline {
|
||||
// RocksDB handle
|
||||
db: rocksdb::DB,
|
||||
|
||||
// WAL redo manager
|
||||
walredo_mgr: Arc<dyn WalRedoManager>,
|
||||
|
||||
// What page versions do we hold in the cache? If we get a request > last_valid_lsn,
|
||||
// we need to wait until we receive all the WAL up to the request. The SeqWait
|
||||
// provides functions for that. TODO: If we get a request for an old LSN, such that
|
||||
// the versions have already been garbage collected away, we should throw an error,
|
||||
// but we don't track that currently.
|
||||
//
|
||||
// last_record_lsn points to the end of last processed WAL record.
|
||||
// It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
|
||||
// after the end of last record, but not the whole next record yet. In the
|
||||
// page cache, we care about last_valid_lsn, but if the WAL receiver needs to
|
||||
// restart the streaming, it needs to restart at the end of last record, so
|
||||
// we track them separately. last_record_lsn should perhaps be in
|
||||
// walreceiver.rs instead of here, but it seems convenient to keep all three
|
||||
// values together.
|
||||
//
|
||||
last_valid_lsn: SeqWait<Lsn>,
|
||||
last_record_lsn: AtomicLsn,
|
||||
|
||||
// Counters, for metrics collection.
|
||||
pub num_entries: AtomicU64,
|
||||
pub num_page_images: AtomicU64,
|
||||
pub num_wal_records: AtomicU64,
|
||||
pub num_getpage_requests: AtomicU64,
|
||||
}
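A minimal sketch of how these two watermarks are meant to be driven; it reuses only the SeqWait and AtomicLsn calls that appear elsewhere in this file (advance, fetch_max, wait_for_timeout), and the LSN values are invented.

// Illustrative only: drive the watermarks the way the WAL receiver and
// GetPage@LSN paths below do. `t` is a RocksTimeline.
fn lsn_watermarks_sketch(t: &RocksTimeline) {
    // Some WAL past the end of the last complete record has arrived:
    // readers may now be served up to here...
    t.last_valid_lsn.advance(Lsn(0x1000_0028));
    // ...but streaming would have to restart at the end of the last full record.
    t.last_record_lsn.fetch_max(Lsn(0x1000_0020));
    // A GetPage@LSN(0x1000_0028) request blocks (with a timeout) until
    // last_valid_lsn has reached the requested LSN.
    let _ = t.last_valid_lsn.wait_for_timeout(Lsn(0x1000_0028), TIMEOUT);
}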
|
||||
//
|
||||
// We store two kinds of entries in the repository:
|
||||
//
|
||||
// 1. Ready-made images of the block
|
||||
// 2. WAL records, to be applied on top of the "previous" entry
|
||||
//
|
||||
// Some WAL records will initialize the page from scratch. For such records,
|
||||
// the 'will_init' flag is set. They don't need the previous page image before
|
||||
// applying. The 'will_init' flag is set for records containing a full-page image,
|
||||
// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
|
||||
// stored directly in the cache entry in that you still need to run the WAL redo
|
||||
// routine to generate the page image.
|
||||
//
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
struct CacheKey {
|
||||
pub tag: BufferTag,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl CacheKey {
|
||||
fn pack(&self, buf: &mut BytesMut) {
|
||||
self.tag.pack(buf);
|
||||
buf.put_u64(self.lsn.0);
|
||||
}
|
||||
fn unpack(buf: &mut Bytes) -> CacheKey {
|
||||
CacheKey {
|
||||
tag: BufferTag::unpack(buf),
|
||||
lsn: Lsn::from(buf.get_u64()),
|
||||
}
|
||||
}
|
||||
|
||||
fn from_slice(slice: &[u8]) -> Self {
|
||||
let mut buf = Bytes::copy_from_slice(slice);
|
||||
Self::unpack(&mut buf)
|
||||
}
|
||||
|
||||
fn to_bytes(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::new();
|
||||
self.pack(&mut buf);
|
||||
buf
|
||||
}
|
||||
}
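One property pack() relies on but does not state: put_u64 writes the LSN big-endian, so the byte-wise key order RocksDB uses agrees with numeric LSN order within one tag (assuming BufferTag::pack emits a fixed-width prefix), which is what lets seek_for_prev find the newest version at or before the requested LSN. A tiny standalone check of that assumption:

// Std-only check that big-endian encoding preserves numeric order,
// the property the CacheKey layout depends on.
fn main() {
    let older: u64 = 0x0000_0000_016D_9A38;
    let newer: u64 = 0x0000_0000_016D_9A70;
    // Byte-wise comparison of the big-endian encodings matches u64 comparison.
    assert!(older.to_be_bytes() < newer.to_be_bytes());
    assert!(older < newer);
}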
|
||||
enum CacheEntryContent {
|
||||
PageImage(Bytes),
|
||||
WALRecord(WALRecord),
|
||||
Truncation,
|
||||
}
|
||||
|
||||
// The serialized representation of a CacheEntryContent begins with
|
||||
// a single byte that indicates what kind of entry it is. There is also
|
||||
// an UNUSED_VERSION_FLAG that is not represented in the CacheEntryContent
|
||||
// at all; you must peek into the first byte of the serialized representation
|
||||
// to read it.
|
||||
const CONTENT_PAGE_IMAGE: u8 = 1u8;
|
||||
const CONTENT_WAL_RECORD: u8 = 2u8;
|
||||
const CONTENT_TRUNCATION: u8 = 3u8;
|
||||
|
||||
const CONTENT_KIND_MASK: u8 = 3u8; // bitmask that covers the above
|
||||
|
||||
const UNUSED_VERSION_FLAG: u8 = 4u8;
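A small sketch of the peek described above, built only from these constants; the GC and put_page_image code below inspect val[0] the same way instead of unpacking the whole entry.

// Classify a serialized CacheEntryContent from its first byte only.
fn peek_entry_kind(serialized: &[u8]) -> (&'static str, bool) {
    let kind = match serialized[0] & CONTENT_KIND_MASK {
        CONTENT_PAGE_IMAGE => "page image",
        CONTENT_WAL_RECORD => "WAL record",
        CONTENT_TRUNCATION => "truncation",
        _ => "unknown",
    };
    let marked_unused = (serialized[0] & UNUSED_VERSION_FLAG) != 0;
    (kind, marked_unused)
}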
|
||||
impl CacheEntryContent {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
match self {
|
||||
CacheEntryContent::PageImage(image) => {
|
||||
buf.put_u8(CONTENT_PAGE_IMAGE);
|
||||
buf.put_u16(image.len() as u16);
|
||||
buf.put_slice(&image[..]);
|
||||
}
|
||||
CacheEntryContent::WALRecord(rec) => {
|
||||
buf.put_u8(CONTENT_WAL_RECORD);
|
||||
rec.pack(buf);
|
||||
}
|
||||
CacheEntryContent::Truncation => {
|
||||
buf.put_u8(CONTENT_TRUNCATION);
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> CacheEntryContent {
|
||||
let kind = buf.get_u8() & CONTENT_KIND_MASK;
|
||||
|
||||
match kind {
|
||||
CONTENT_PAGE_IMAGE => {
|
||||
let len = buf.get_u16() as usize;
|
||||
let mut dst = vec![0u8; len];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
CacheEntryContent::PageImage(Bytes::from(dst))
|
||||
}
|
||||
CONTENT_WAL_RECORD => CacheEntryContent::WALRecord(WALRecord::unpack(buf)),
|
||||
CONTENT_TRUNCATION => CacheEntryContent::Truncation,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
fn from_slice(slice: &[u8]) -> Self {
|
||||
let mut buf = Bytes::copy_from_slice(slice);
|
||||
Self::unpack(&mut buf)
|
||||
}
|
||||
|
||||
fn to_bytes(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::new();
|
||||
self.pack(&mut buf);
|
||||
buf
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksRepository {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
walredo_mgr: Arc<dyn WalRedoManager>,
|
||||
) -> RocksRepository {
|
||||
RocksRepository {
|
||||
conf: conf,
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
walredo_mgr,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get handle to a given timeline. It is assumed to already exist.
|
||||
impl Repository for RocksRepository {
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
|
||||
match timelines.get(&timelineid) {
|
||||
Some(timeline) => Ok(timeline.clone()),
|
||||
None => bail!("timeline not found"),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_or_restore_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
match timelines.get(&timelineid) {
|
||||
Some(timeline) => Ok(timeline.clone()),
|
||||
None => {
|
||||
let timeline = RocksTimeline::new(self.conf, timelineid, self.walredo_mgr.clone());
|
||||
|
||||
restore_timeline(self.conf, &timeline, timelineid)?;
|
||||
|
||||
let timeline_rc = Arc::new(timeline);
|
||||
|
||||
timelines.insert(timelineid, timeline_rc.clone());
|
||||
|
||||
if self.conf.gc_horizon != 0 {
|
||||
let timeline_rc_copy = timeline_rc.clone();
|
||||
let conf = self.conf;
|
||||
let _gc_thread = thread::Builder::new()
|
||||
.name("Garbage collection thread".into())
|
||||
.spawn(move || {
|
||||
// FIXME
|
||||
timeline_rc_copy.do_gc(conf).expect("GC thread died");
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
Ok(timeline_rc)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
let timeline = RocksTimeline::new(&self.conf, timelineid, self.walredo_mgr.clone());
|
||||
|
||||
let timeline_rc = Arc::new(timeline);
|
||||
let r = timelines.insert(timelineid, timeline_rc.clone());
|
||||
assert!(r.is_none());
|
||||
|
||||
// don't start the garbage collector for unit tests, either.
|
||||
|
||||
Ok(timeline_rc)
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksTimeline {
|
||||
fn open_rocksdb(conf: &PageServerConf, timelineid: ZTimelineId) -> rocksdb::DB {
|
||||
let path = conf.timeline_path(timelineid);
|
||||
let mut opts = rocksdb::Options::default();
|
||||
opts.create_if_missing(true);
|
||||
opts.set_use_fsync(true);
|
||||
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
|
||||
opts.set_compaction_filter("ttl", move |_level: u32, _key: &[u8], val: &[u8]| {
|
||||
if (val[0] & UNUSED_VERSION_FLAG) != 0 {
|
||||
rocksdb::compaction_filter::Decision::Remove
|
||||
} else {
|
||||
rocksdb::compaction_filter::Decision::Keep
|
||||
}
|
||||
});
|
||||
rocksdb::DB::open(&opts, &path).unwrap()
|
||||
}
|
||||
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager>,
|
||||
) -> RocksTimeline {
|
||||
RocksTimeline {
|
||||
db: RocksTimeline::open_rocksdb(conf, timelineid),
|
||||
|
||||
walredo_mgr,
|
||||
|
||||
last_valid_lsn: SeqWait::new(Lsn(0)),
|
||||
last_record_lsn: AtomicLsn::new(0),
|
||||
|
||||
num_entries: AtomicU64::new(0),
|
||||
num_page_images: AtomicU64::new(0),
|
||||
num_wal_records: AtomicU64::new(0),
|
||||
num_getpage_requests: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksTimeline {
|
||||
///
|
||||
/// Collect all the WAL records that are needed to reconstruct a page
|
||||
/// image for the given cache entry.
|
||||
///
|
||||
/// Returns an old page image (if any), and a vector of WAL records to apply
|
||||
/// over it.
|
||||
///
|
||||
fn collect_records_for_apply(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
) -> (Option<Bytes>, Vec<WALRecord>) {
|
||||
let key = CacheKey { tag, lsn };
|
||||
let mut base_img: Option<Bytes> = None;
|
||||
let mut records: Vec<WALRecord> = Vec::new();
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(key.to_bytes());
|
||||
|
||||
// Scan backwards, collecting the WAL records, until we hit an
|
||||
// old page image.
|
||||
while iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag != tag {
|
||||
break;
|
||||
}
|
||||
let content = CacheEntryContent::from_slice(iter.value().unwrap());
|
||||
if let CacheEntryContent::PageImage(img) = content {
|
||||
// We have a base image. No need to dig deeper into the list of
|
||||
// records
|
||||
base_img = Some(img);
|
||||
break;
|
||||
} else if let CacheEntryContent::WALRecord(rec) = content {
|
||||
records.push(rec.clone());
|
||||
// If this WAL record initializes the page, no need to dig deeper.
|
||||
if rec.will_init {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
panic!("no base image and no WAL record on cache entry");
|
||||
}
|
||||
iter.prev();
|
||||
}
|
||||
records.reverse();
|
||||
(base_img, records)
|
||||
}
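For context, this is roughly how the pair returned here is consumed in get_page_at_lsn() and do_gc() below; a sketch reusing the same signatures, not new API.

// Sketch: replay the collected records on top of the base image and memoize
// the result so the chain does not have to be replayed again.
fn materialize_page(t: &RocksTimeline, tag: BufferTag, lsn: Lsn) -> Result<Bytes, WalRedoError> {
    let (base_img, records) = t.collect_records_for_apply(tag, lsn);
    let img = t.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
    t.put_page_image(tag, lsn, img.clone());
    Ok(img)
}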
|
||||
// Internal functions
|
||||
|
||||
//
|
||||
// Internal function to get relation size at given LSN.
|
||||
//
|
||||
// The caller must ensure that WAL has been received up to 'lsn'.
|
||||
//
|
||||
fn relsize_get_nowait(&self, rel: RelTag, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn <= self.last_valid_lsn.load());
|
||||
|
||||
let mut key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel,
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
loop {
|
||||
iter.seek_for_prev(key.to_bytes());
|
||||
if iter.valid() {
|
||||
let thiskey = CacheKey::from_slice(iter.key().unwrap());
|
||||
if thiskey.tag.rel == rel {
|
||||
let content = CacheEntryContent::from_slice(iter.value().unwrap());
|
||||
if let CacheEntryContent::Truncation = content {
|
||||
if thiskey.tag.blknum > 0 {
|
||||
key.tag.blknum = thiskey.tag.blknum - 1;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
let relsize = thiskey.tag.blknum + 1;
|
||||
debug!("Size of relation {} at {} is {}", rel, lsn, relsize);
|
||||
return Ok(relsize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
debug!("Size of relation {} at {} is zero", rel, lsn);
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
fn do_gc(&self, conf: &'static PageServerConf) -> Result<Bytes> {
|
||||
loop {
|
||||
thread::sleep(conf.gc_period);
|
||||
let last_lsn = self.get_last_valid_lsn();
|
||||
|
||||
// checked_sub() returns None on overflow.
|
||||
if let Some(horizon) = last_lsn.checked_sub(conf.gc_horizon) {
|
||||
let mut maxkey = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: u32::MAX,
|
||||
dbnode: u32::MAX,
|
||||
relnode: u32::MAX,
|
||||
forknum: u8::MAX,
|
||||
},
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn: Lsn::MAX,
|
||||
};
|
||||
let now = Instant::now();
|
||||
let mut reconstructed = 0u64;
|
||||
let mut truncated = 0u64;
|
||||
let mut inspected = 0u64;
|
||||
let mut deleted = 0u64;
|
||||
loop {
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(maxkey.to_bytes());
|
||||
if iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
let v = iter.value().unwrap();
|
||||
|
||||
inspected += 1;
|
||||
|
||||
// Construct boundaries for old records cleanup
|
||||
maxkey.tag = key.tag;
|
||||
let last_lsn = key.lsn;
|
||||
maxkey.lsn = min(horizon, last_lsn); // do not remove last version
|
||||
|
||||
let mut minkey = maxkey.clone();
|
||||
minkey.lsn = Lsn(0); // first version
|
||||
|
||||
// Special handling of delete of PREPARE WAL record
|
||||
if last_lsn < horizon
|
||||
&& key.tag.rel.forknum == pg_constants::PG_TWOPHASE_FORKNUM
|
||||
{
|
||||
if (v[0] & UNUSED_VERSION_FLAG) == 0 {
|
||||
let mut v = v.to_owned();
|
||||
v[0] |= UNUSED_VERSION_FLAG;
|
||||
self.db.put(key.to_bytes(), &v[..])?;
|
||||
deleted += 1;
|
||||
}
|
||||
maxkey = minkey;
|
||||
continue;
|
||||
}
|
||||
// reconstruct most recent page version
|
||||
if (v[0] & CONTENT_KIND_MASK) == CONTENT_WAL_RECORD {
|
||||
// force reconstruction of most recent page version
|
||||
let (base_img, records) =
|
||||
self.collect_records_for_apply(key.tag, key.lsn);
|
||||
|
||||
trace!(
|
||||
"Reconstruct most recent page {} blk {} at {} from {} records",
|
||||
key.tag.rel,
|
||||
key.tag.blknum,
|
||||
key.lsn,
|
||||
records.len()
|
||||
);
|
||||
|
||||
let new_img = self
|
||||
.walredo_mgr
|
||||
.request_redo(key.tag, key.lsn, base_img, records)?;
|
||||
self.put_page_image(key.tag, key.lsn, new_img.clone());
|
||||
|
||||
reconstructed += 1;
|
||||
}
|
||||
|
||||
iter.seek_for_prev(maxkey.to_bytes());
|
||||
if iter.valid() {
|
||||
// do not remove last version
|
||||
if last_lsn > horizon {
|
||||
// locate most recent record before horizon
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag == maxkey.tag {
|
||||
let v = iter.value().unwrap();
|
||||
if (v[0] & CONTENT_KIND_MASK) == CONTENT_WAL_RECORD {
|
||||
let (base_img, records) =
|
||||
self.collect_records_for_apply(key.tag, key.lsn);
|
||||
trace!("Reconstruct horizon page {} blk {} at {} from {} records",
|
||||
key.tag.rel, key.tag.blknum, key.lsn, records.len());
|
||||
let new_img = self
|
||||
.walredo_mgr
|
||||
.request_redo(key.tag, key.lsn, base_img, records)?;
|
||||
self.put_page_image(key.tag, key.lsn, new_img.clone());
|
||||
|
||||
truncated += 1;
|
||||
} else {
|
||||
trace!(
|
||||
"Keeping horizon page {} blk {} at {}",
|
||||
key.tag.rel,
|
||||
key.tag.blknum,
|
||||
key.lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
trace!(
|
||||
"Last page {} blk {} at {}, horizon {}",
|
||||
key.tag.rel,
|
||||
key.tag.blknum,
|
||||
key.lsn,
|
||||
horizon
|
||||
);
|
||||
}
|
||||
// remove records prior to horizon
|
||||
loop {
|
||||
iter.prev();
|
||||
if !iter.valid() {
|
||||
break;
|
||||
}
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag != maxkey.tag {
|
||||
break;
|
||||
}
|
||||
let v = iter.value().unwrap();
|
||||
if (v[0] & UNUSED_VERSION_FLAG) == 0 {
|
||||
let mut v = v.to_owned();
|
||||
v[0] |= UNUSED_VERSION_FLAG;
|
||||
self.db.put(key.to_bytes(), &v[..])?;
|
||||
deleted += 1;
|
||||
trace!(
|
||||
"deleted: {} blk {} at {}",
|
||||
key.tag.rel,
|
||||
key.tag.blknum,
|
||||
key.lsn
|
||||
);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
maxkey = minkey;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
info!("Garbage collection completed in {:?}:\n{} version chains inspected, {} pages reconstructed, {} version histories truncated, {} versions deleted",
|
||||
now.elapsed(), inspected, reconstructed, truncated, deleted);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Wait until WAL has been received up to the given LSN.
|
||||
//
|
||||
fn wait_lsn(&self, mut lsn: Lsn) -> Result<Lsn> {
|
||||
// When invalid LSN is requested, it means "don't wait, return latest version of the page"
|
||||
// This is necessary for bootstrap.
|
||||
if lsn == Lsn(0) {
|
||||
let last_valid_lsn = self.last_valid_lsn.load();
|
||||
trace!(
|
||||
"walreceiver doesn't work yet last_valid_lsn {}, requested {}",
|
||||
last_valid_lsn,
|
||||
lsn
|
||||
);
|
||||
lsn = last_valid_lsn;
|
||||
}
|
||||
//trace!("Start waiting for LSN {}, valid LSN is {}", lsn, self.last_valid_lsn.load());
|
||||
self.last_valid_lsn
|
||||
.wait_for_timeout(lsn, TIMEOUT)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive",
|
||||
lsn
|
||||
)
|
||||
})?;
|
||||
//trace!("Stop waiting for LSN {}, valid LSN is {}", lsn, self.last_valid_lsn.load());
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline for RocksTimeline {
|
||||
// Public GET interface functions
|
||||
|
||||
///
|
||||
/// GetPage@LSN
|
||||
///
|
||||
/// Returns an 8k page image
|
||||
///
|
||||
fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: Lsn) -> Result<Bytes> {
|
||||
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let lsn = self.wait_lsn(req_lsn)?;
|
||||
|
||||
// Look up cache entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let key = CacheKey { tag, lsn };
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(key.to_bytes());
|
||||
|
||||
if iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag == tag {
|
||||
let content = CacheEntryContent::from_slice(iter.value().unwrap());
|
||||
let page_img: Bytes;
|
||||
if let CacheEntryContent::PageImage(img) = content {
|
||||
page_img = img;
|
||||
} else if let CacheEntryContent::WALRecord(_rec) = content {
|
||||
// Request the WAL redo manager to apply the WAL records for us.
|
||||
let (base_img, records) = self.collect_records_for_apply(tag, lsn);
|
||||
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
|
||||
|
||||
self.put_page_image(tag, lsn, page_img.clone());
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
// FIXME: assumes little-endian. Only used for the debugging log though
|
||||
let page_lsn_hi =
|
||||
u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
|
||||
let page_lsn_lo =
|
||||
u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
|
||||
debug!(
|
||||
"Returning page with LSN {:X}/{:X} for {} blk {}",
|
||||
page_lsn_hi, page_lsn_lo, tag.rel, tag.blknum
|
||||
);
|
||||
return Ok(page_img);
|
||||
}
|
||||
}
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
debug!(
|
||||
"Page {} blk {} at {}({}) not found",
|
||||
tag.rel, tag.blknum, req_lsn, lsn
|
||||
);
|
||||
Ok(Bytes::from_static(&ZERO_PAGE))
|
||||
/* return Err("could not find page image")?; */
|
||||
}
|
||||
|
||||
///
|
||||
/// Get size of relation at given LSN.
|
||||
///
|
||||
fn get_relsize(&self, rel: RelTag, lsn: Lsn) -> Result<u32> {
|
||||
let lsn = self.wait_lsn(lsn)?;
|
||||
self.relsize_get_nowait(rel, lsn)
|
||||
}
|
||||
|
||||
/// Get vector of prepared twophase transactions
|
||||
fn get_twophase(&self, lsn: Lsn) -> Result<Vec<TransactionId>> {
|
||||
let key = CacheKey {
|
||||
// minimal key
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
forknum: pg_constants::PG_TWOPHASE_FORKNUM,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
},
|
||||
blknum: 0,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let mut gxacts = Vec::new();
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(key.to_bytes());
|
||||
while iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag.rel.forknum != pg_constants::PG_TWOPHASE_FORKNUM {
|
||||
break; // we are done with this fork
|
||||
}
|
||||
if key.lsn <= lsn {
|
||||
let xid = key.tag.blknum;
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
forknum: pg_constants::PG_XACT_FORKNUM,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
},
|
||||
blknum: xid / pg_constants::CLOG_XACTS_PER_PAGE,
|
||||
};
|
||||
let clog_page = self.get_page_at_lsn(tag, lsn)?;
|
||||
let status = transaction_id_get_status(xid, &clog_page[..]);
|
||||
if status == pg_constants::TRANSACTION_STATUS_IN_PROGRESS {
|
||||
gxacts.push(xid);
|
||||
}
|
||||
}
|
||||
iter.next();
|
||||
}
|
||||
return Ok(gxacts);
|
||||
}
|
||||
|
||||
/// Get databases. This function is used to locate pg_filenode.map files
|
||||
fn get_databases(&self, lsn: Lsn) -> Result<Vec<RelTag>> {
|
||||
let key = CacheKey {
|
||||
// minimal key
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
},
|
||||
blknum: 0,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let mut dbs = Vec::new();
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(key.to_bytes());
|
||||
let mut prev_tag = key.tag.rel;
|
||||
while iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag.rel.forknum != pg_constants::PG_FILENODEMAP_FORKNUM {
|
||||
break; // we are done with this fork
|
||||
}
|
||||
if key.tag.rel != prev_tag && key.lsn <= lsn {
|
||||
prev_tag = key.tag.rel;
|
||||
dbs.push(prev_tag); // collect unique tags
|
||||
}
|
||||
iter.next();
|
||||
}
|
||||
return Ok(dbs);
|
||||
}
|
||||
|
||||
/// Get range [begin,end) of stored blocks. Used mostly for SMGR pseudorelations
|
||||
/// but can also be applied to normal relations.
|
||||
fn get_range(&self, rel: RelTag, lsn: Lsn) -> Result<(u32, u32)> {
|
||||
let _lsn = self.wait_lsn(lsn)?;
|
||||
let mut key = CacheKey {
|
||||
// minimal key to start with
|
||||
tag: BufferTag { rel, blknum: 0 },
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(key.to_bytes()); // locate first entry
|
||||
if iter.valid() {
|
||||
let thiskey = CacheKey::from_slice(iter.key().unwrap());
|
||||
let tag = thiskey.tag;
|
||||
if tag.rel == rel {
|
||||
// still traversing this relation
|
||||
let first_blknum = tag.blknum;
|
||||
key.tag.blknum = u32::MAX; // maximal key
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(key.to_bytes()); // locate last entry
|
||||
if iter.valid() {
|
||||
let thiskey = CacheKey::from_slice(iter.key().unwrap());
|
||||
let last_blknum = thiskey.tag.blknum;
|
||||
return Ok((first_blknum, last_blknum + 1)); // upper boundary is exclusive
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok((0, 0)) // empty range
|
||||
}
|
||||
|
||||
///
|
||||
/// Does relation exist at given LSN?
|
||||
///
|
||||
/// FIXME: this actually returns true, if the relation exists at *any* LSN
|
||||
fn get_relsize_exists(&self, rel: RelTag, req_lsn: Lsn) -> Result<bool> {
|
||||
let lsn = self.wait_lsn(req_lsn)?;
|
||||
|
||||
let key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel,
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(key.to_bytes());
|
||||
if iter.valid() {
|
||||
let key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag.rel == rel {
|
||||
debug!("Relation {} exists at {}", rel, lsn);
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
debug!("Relation {} doesn't exist at {}", rel, lsn);
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
// Other public functions, for updating the repository.
|
||||
// These are used by the WAL receiver and WAL redo.
|
||||
|
||||
///
|
||||
/// Adds a WAL record to the repository
|
||||
///
|
||||
fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
|
||||
let lsn = rec.lsn;
|
||||
let key = CacheKey { tag, lsn };
|
||||
|
||||
let content = CacheEntryContent::WALRecord(rec);
|
||||
|
||||
let _res = self.db.put(key.to_bytes(), content.to_bytes());
|
||||
trace!(
|
||||
"put_wal_record rel {} blk {} at {}",
|
||||
tag.rel,
|
||||
tag.blknum,
|
||||
lsn
|
||||
);
|
||||
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
self.num_wal_records.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
///
|
||||
/// Adds a relation-wide WAL record (like truncate) to the repository,
|
||||
/// associating it with all pages starting with the specified block number
|
||||
///
|
||||
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()> {
|
||||
// What was the size of the relation before this record?
|
||||
let last_lsn = self.last_valid_lsn.load();
|
||||
let old_rel_size = self.relsize_get_nowait(rel, last_lsn)?;
|
||||
|
||||
let content = CacheEntryContent::Truncation;
|
||||
// set new relation size
|
||||
trace!("Truncate relation {} to {} blocks at {}", rel, nblocks, lsn);
|
||||
|
||||
for blknum in nblocks..old_rel_size {
|
||||
let key = CacheKey {
|
||||
tag: BufferTag { rel, blknum },
|
||||
lsn,
|
||||
};
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
let _res = self.db.put(key.to_bytes(), content.to_bytes());
|
||||
}
|
||||
let n = (old_rel_size - nblocks) as u64;
|
||||
self.num_entries.fetch_add(n, Ordering::Relaxed);
|
||||
self.num_wal_records.fetch_add(n, Ordering::Relaxed);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Get page image at particular LSN
|
||||
///
|
||||
fn get_page_image(&self, tag: BufferTag, lsn: Lsn) -> Result<Option<Bytes>> {
|
||||
let key = CacheKey { tag, lsn };
|
||||
if let Some(bytes) = self.db.get(key.to_bytes())? {
|
||||
let content = CacheEntryContent::from_slice(&bytes);
|
||||
if let CacheEntryContent::PageImage(img) = content {
|
||||
return Ok(Some(img));
|
||||
}
|
||||
}
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
///
|
||||
/// Memorize a full image of a page version
|
||||
///
|
||||
fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes) {
|
||||
let img_len = img.len();
|
||||
let key = CacheKey { tag, lsn };
|
||||
let content = CacheEntryContent::PageImage(img);
|
||||
|
||||
let mut val_buf = content.to_bytes();
|
||||
|
||||
// Zero size of page image indicates that page can be removed
|
||||
if img_len == 0 {
|
||||
if (val_buf[0] & UNUSED_VERSION_FLAG) != 0 {
|
||||
// records already marked for deletion
|
||||
return;
|
||||
} else {
|
||||
// delete truncated multixact page
|
||||
val_buf[0] |= UNUSED_VERSION_FLAG;
|
||||
}
|
||||
}
|
||||
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
let _res = self.db.put(key.to_bytes(), content.to_bytes());
|
||||
|
||||
trace!(
|
||||
"put_page_image rel {} blk {} at {}",
|
||||
tag.rel,
|
||||
tag.blknum,
|
||||
lsn
|
||||
);
|
||||
self.num_page_images.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn put_create_database(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
db_id: Oid,
|
||||
tablespace_id: Oid,
|
||||
src_db_id: Oid,
|
||||
src_tablespace_id: Oid,
|
||||
) -> Result<()> {
|
||||
let mut n = 0;
|
||||
for forknum in &[
|
||||
pg_constants::MAIN_FORKNUM,
|
||||
pg_constants::FSM_FORKNUM,
|
||||
pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
pg_constants::INIT_FORKNUM,
|
||||
pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
] {
|
||||
let key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: src_tablespace_id,
|
||||
dbnode: src_db_id,
|
||||
relnode: 0,
|
||||
forknum: *forknum,
|
||||
},
|
||||
blknum: 0,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(key.to_bytes());
|
||||
while iter.valid() {
|
||||
let mut key = CacheKey::from_slice(iter.key().unwrap());
|
||||
if key.tag.rel.spcnode != src_tablespace_id || key.tag.rel.dbnode != src_db_id {
|
||||
break;
|
||||
}
|
||||
key.tag.rel.spcnode = tablespace_id;
|
||||
key.tag.rel.dbnode = db_id;
|
||||
key.lsn = lsn;
|
||||
|
||||
let v = iter.value().unwrap();
|
||||
self.db.put(key.to_bytes(), v)?;
|
||||
n += 1;
|
||||
iter.next();
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Create database {}/{}, copy {} entries",
|
||||
tablespace_id, db_id, n
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that WAL has been received and added to the timeline up to the given LSN
|
||||
fn advance_last_valid_lsn(&self, lsn: Lsn) {
|
||||
let lsn = Lsn((lsn.0 + 7) & !7); // align position on 8 bytes
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
|
||||
// Can't move backwards.
|
||||
if lsn < old {
|
||||
warn!(
|
||||
"attempted to move last valid LSN backwards (was {}, new {})",
|
||||
old, lsn
|
||||
);
|
||||
}
|
||||
}
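A worked example of the round-up above: (x + 7) & !7 adds 7 and then clears the low three bits, i.e. rounds up to the next 8-byte boundary (values arbitrary).

// Same alignment expression as advance_last_valid_lsn/advance_last_record_lsn.
fn main() {
    let align8 = |x: u64| (x + 7) & !7u64;
    assert_eq!(align8(0x16D9A39), 0x16D9A40); // mid-record position rounds up
    assert_eq!(align8(0x16D9A40), 0x16D9A40); // already aligned: unchanged
}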
|
||||
///
|
||||
/// Remember the (end of) last valid WAL record remembered for the timeline.
|
||||
///
|
||||
/// NOTE: this updates last_valid_lsn as well.
|
||||
///
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn) {
|
||||
let lsn = Lsn((lsn.0 + 7) & !7); // align position on 8 bytes
|
||||
// Can't move backwards.
|
||||
let old = self.last_record_lsn.fetch_max(lsn);
|
||||
assert!(old <= lsn);
|
||||
|
||||
// Also advance last_valid_lsn
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
// Can't move backwards.
|
||||
if lsn < old {
|
||||
warn!(
|
||||
"attempted to move last record LSN backwards (was {}, new {})",
|
||||
old, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load()
|
||||
}
|
||||
|
||||
fn init_valid_lsn(&self, lsn: Lsn) {
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
assert!(old == Lsn(0));
|
||||
let old = self.last_record_lsn.fetch_max(lsn);
|
||||
assert!(old == Lsn(0));
|
||||
}
|
||||
|
||||
fn get_last_valid_lsn(&self) -> Lsn {
|
||||
self.last_valid_lsn.load()
|
||||
}
|
||||
|
||||
//
|
||||
// Get statistics to be displayed in the user interface.
|
||||
//
|
||||
// FIXME
|
||||
/*
|
||||
fn get_stats(&self) -> TimelineStats {
|
||||
TimelineStats {
|
||||
num_entries: self.num_entries.load(Ordering::Relaxed),
|
||||
num_page_images: self.num_page_images.load(Ordering::Relaxed),
|
||||
num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
|
||||
num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
@@ -1,17 +1,9 @@
|
||||
//
|
||||
// Restore chunks from local Zenith repository
|
||||
//
|
||||
// This runs once at Page Server startup. It loads all the "snapshots" and all
|
||||
// WAL from all timelines from the local zenith repository into the in-memory page
|
||||
// cache.
|
||||
//
|
||||
// This also initializes the "last valid LSN" in the page cache to the last LSN
|
||||
// seen in the WAL, so that when the WAL receiver is started, it starts
|
||||
// streaming from that LSN.
|
||||
//
|
||||
|
||||
//!
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! zenith Timeline.
|
||||
//!
|
||||
use log::*;
|
||||
use std::cmp::max;
|
||||
use std::cmp::{max, min};
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
@@ -20,63 +12,20 @@ use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use crate::repository::{BufferTag, RelTag, Timeline};
|
||||
use crate::waldecoder::{decode_wal_record, Oid, WalStreamDecoder};
|
||||
use crate::object_key::*;
|
||||
use crate::repository::*;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Load all WAL and all relation data pages from local disk into the repository.
|
||||
///
|
||||
pub fn restore_timeline(
|
||||
conf: &PageServerConf,
|
||||
timeline: &dyn Timeline,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<()> {
|
||||
let timelinepath = PathBuf::from("timelines").join(timelineid.to_string());
|
||||
|
||||
if !timelinepath.exists() {
|
||||
anyhow::bail!("timeline {} does not exist in the page server's repository", timelineid);
|
||||
}
|
||||
|
||||
// Scan .zenith/timelines/<timeline>/snapshots
|
||||
let snapshotspath = PathBuf::from("timelines")
|
||||
.join(timelineid.to_string())
|
||||
.join("snapshots");
|
||||
|
||||
let mut last_snapshot_lsn: Lsn = Lsn(0);
|
||||
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let direntry = direntry?;
|
||||
let filename = direntry.file_name();
|
||||
let lsn = Lsn::from_filename(&filename)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
|
||||
// FIXME: pass filename as Path instead of str?
|
||||
let filename_str = filename.into_string().unwrap();
|
||||
restore_snapshot(conf, timeline, timelineid, &filename_str)?;
|
||||
info!("restored snapshot at {:?}", filename_str);
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == Lsn(0) {
|
||||
error!(
|
||||
"could not find valid snapshot in {}",
|
||||
snapshotspath.display()
|
||||
);
|
||||
// TODO return error?
|
||||
}
|
||||
timeline.init_valid_lsn(last_snapshot_lsn);
|
||||
|
||||
restore_wal(timeline, timelineid, last_snapshot_lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
const MAX_MBR_BLKNO: u32 =
|
||||
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
|
||||
///
|
||||
/// Find latest snapshot in a timeline's 'snapshots' directory
|
||||
@@ -102,61 +51,54 @@ pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Re
|
||||
Ok(last_snapshot_lsn)
|
||||
}
|
||||
|
||||
fn restore_snapshot(
|
||||
conf: &PageServerConf,
|
||||
///
|
||||
/// Import all relation data pages from local disk into the repository.
|
||||
///
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
timelineid: ZTimelineId,
|
||||
snapshot: &str,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
let snapshotpath = PathBuf::from("timelines")
|
||||
.join(timelineid.to_string())
|
||||
.join("snapshots")
|
||||
.join(snapshot);
|
||||
|
||||
// Scan 'global'
|
||||
for direntry in fs::read_dir(snapshotpath.join("global"))? {
|
||||
for direntry in fs::read_dir(path.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("pg_control") => restore_nonrel_file(
|
||||
conf,
|
||||
Some("pg_control") => {
|
||||
import_nonrel_file(timeline, lsn, ObjectTag::ControlFile, &direntry.path())?;
|
||||
// Extract checkpoint record from pg_control and store is as separate object
|
||||
let pg_control_bytes =
|
||||
timeline.get_page_at_lsn_nowait(ObjectTag::ControlFile, lsn, false)?;
|
||||
let pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
timeline.put_page_image(ObjectTag::Checkpoint, lsn, checkpoint_bytes, false)?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
timelineid,
|
||||
"0",
|
||||
0,
|
||||
0,
|
||||
pg_constants::PG_CONTROLFILE_FORKNUM,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
Some("pg_filenode.map") => restore_nonrel_file(
|
||||
conf,
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
0,
|
||||
lsn,
|
||||
ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
}),
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
snapshot,
|
||||
lsn,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
}
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains database dirs, the database OID is the filename.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
for direntry in fs::read_dir(snapshotpath.join("base"))? {
|
||||
for direntry in fs::read_dir(path.join("base"))? {
|
||||
let direntry = direntry?;
|
||||
|
||||
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
|
||||
@@ -168,91 +110,79 @@ fn restore_snapshot(
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => restore_nonrel_file(
|
||||
conf,
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
0,
|
||||
lsn,
|
||||
ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode: dboid,
|
||||
}),
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
snapshot,
|
||||
lsn,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_xact"))? {
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
restore_slru_file(
|
||||
conf,
|
||||
import_slru_file(
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::PG_XACT_FORKNUM,
|
||||
lsn,
|
||||
|blknum| ObjectTag::Clog(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_multixact").join("members"))? {
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
restore_slru_file(
|
||||
conf,
|
||||
import_slru_file(
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::PG_MXACT_MEMBERS_FORKNUM,
|
||||
lsn,
|
||||
|blknum| ObjectTag::MultiXactMembers(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_multixact").join("offsets"))? {
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
restore_slru_file(
|
||||
conf,
|
||||
import_slru_file(
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
pg_constants::PG_MXACT_OFFSETS_FORKNUM,
|
||||
lsn,
|
||||
|blknum| ObjectTag::MultiXactOffsets(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_twophase"))? {
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(entry.path().file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
restore_nonrel_file(
|
||||
conf,
|
||||
import_nonrel_file(
|
||||
timeline,
|
||||
timelineid,
|
||||
snapshot,
|
||||
0,
|
||||
0,
|
||||
pg_constants::PG_TWOPHASE_FORKNUM,
|
||||
xid,
|
||||
lsn,
|
||||
ObjectTag::TwoPhase(PrepareTag { xid }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
timeline.checkpoint()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_relfile(
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_relfile(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
snapshot: &str,
|
||||
lsn: Lsn,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
@@ -270,7 +200,7 @@ fn restore_relfile(
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = BufferTag {
|
||||
let tag = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
@@ -278,13 +208,8 @@ fn restore_relfile(
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
*/
|
||||
});
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf), true)?;
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
@@ -306,75 +231,43 @@ fn restore_relfile(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_nonrel_file(
|
||||
_conf: &PageServerConf,
|
||||
fn import_nonrel_file(
|
||||
timeline: &dyn Timeline,
|
||||
_timelineid: ZTimelineId,
|
||||
snapshot: &str,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
forknum: u8,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
tag: ObjectTag,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode: 0,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]));
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]), false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_slru_file(
|
||||
_conf: &PageServerConf,
|
||||
fn import_slru_file(
|
||||
timeline: &dyn Timeline,
|
||||
_timelineid: ZTimelineId,
|
||||
snapshot: &str,
|
||||
forknum: u8,
|
||||
lsn: Lsn,
|
||||
gen_tag: fn(blknum: u32) -> ObjectTag,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
*/
|
||||
timeline.put_page_image(
|
||||
gen_tag(blknum),
|
||||
lsn,
|
||||
Bytes::copy_from_slice(&buf),
|
||||
false,
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
@@ -396,35 +289,27 @@ fn restore_slru_file(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Scan WAL on a timeline, starting from given LSN, and load all the records
|
||||
// into the page cache.
|
||||
fn restore_wal(timeline: &dyn Timeline, timelineid: ZTimelineId, startpoint: Lsn) -> Result<()> {
|
||||
let walpath = format!("timelines/{}/wal", timelineid);
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory, and load all records >= 'startpoint' into
|
||||
/// the repository.
|
||||
pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint: Lsn) -> Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = Lsn(0);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let mut checkpoint = CheckPoint::new(startpoint.0, 1);
|
||||
let checkpoint_tag = BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM);
|
||||
let pg_control_tag = BufferTag::fork(pg_constants::PG_CONTROLFILE_FORKNUM);
|
||||
if let Some(pg_control_bytes) = timeline.get_page_image(pg_control_tag, Lsn(0))? {
|
||||
let pg_control = decode_pg_control(pg_control_bytes)?;
|
||||
checkpoint = pg_control.checkPointCopy.clone();
|
||||
} else {
|
||||
error!("No control file found in repository");
|
||||
}
|
||||
let checkpoint_bytes =
|
||||
timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint, false)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut path = walpath.clone() + "/" + &filename;
|
||||
let mut path = walpath.join(&filename);
|
||||
|
||||
// It may only exist as a .partial file
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path += ".partial";
|
||||
path = walpath.join(filename + ".partial");
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
@@ -455,12 +340,11 @@ fn restore_wal(timeline: &dyn Timeline, timelineid: ZTimelineId, startpoint: Lsn
|
||||
// Assume that an error means we've reached the end of
|
||||
// a partial WAL record. So that's ok.
|
||||
trace!("WAL decoder error {:?}", rec);
|
||||
waldecoder.set_position(Lsn((segno + 1) * pg_constants::WAL_SEGMENT_SIZE as u64));
|
||||
break;
|
||||
}
|
||||
if let Some((lsn, recdata)) = rec.unwrap() {
|
||||
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
|
||||
timeline.save_decoded_record(decoded, recdata, lsn)?;
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
save_decoded_record(&mut checkpoint, timeline, &decoded, recdata, lsn)?;
|
||||
last_lsn = lsn;
|
||||
} else {
|
||||
break;
|
||||
@@ -469,15 +353,484 @@ fn restore_wal(timeline: &dyn Timeline, timelineid: ZTimelineId, startpoint: Lsn
|
||||
}
|
||||
|
||||
info!(
|
||||
"restored {} records from WAL file {} at {}",
|
||||
nrecords, filename, last_lsn
|
||||
"imported {} records from WAL file {} up to {}",
|
||||
nrecords,
|
||||
path.display(),
|
||||
last_lsn
|
||||
);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
let checkpoint_bytes = encode_checkpoint(checkpoint);
|
||||
timeline.put_page_image(checkpoint_tag, Lsn(0), checkpoint_bytes);
|
||||
let checkpoint_bytes = checkpoint.encode();
|
||||
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes, false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
|
||||
/// relations/pages that the record affects.
|
||||
///
|
||||
pub fn save_decoded_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
decoded: &DecodedWALRecord,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
checkpoint.update_next_xid(decoded.xl_xid);
|
||||
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
});
|
||||
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
}
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
// Handle a few special record types
|
||||
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE
|
||||
{
|
||||
let truncate = XlSmgrTruncate::decode(&mut buf);
|
||||
save_xlog_smgr_truncate(timeline, lsn, &truncate)?;
|
||||
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
|
||||
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE {
|
||||
let createdb = XlCreateDatabase::decode(&mut buf);
|
||||
save_xlog_dbase_create(timeline, lsn, &createdb)?;
|
||||
} else {
|
||||
// TODO
|
||||
trace!("XLOG_DBASE_DROP is not handled yet");
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
|
||||
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
|
||||
} else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let blknum = buf.get_u32_le();
|
||||
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
checkpoint.oldestXid = buf.get_u32_le();
|
||||
checkpoint.oldestXidDB = buf.get_u32_le();
|
||||
trace!(
|
||||
"RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
|
||||
blknum,
|
||||
checkpoint.oldestXid,
|
||||
checkpoint.oldestXidDB
|
||||
);
|
||||
if let Some(ObjectTag::Clog(first_slru_tag)) =
|
||||
timeline.get_next_tag(ObjectTag::Clog(SlruBufferTag { blknum: 0 }))?
|
||||
{
|
||||
for trunc_blknum in first_slru_tag.blknum..=blknum {
|
||||
let tag = ObjectTag::Clog(SlruBufferTag {
|
||||
blknum: trunc_blknum,
|
||||
});
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
|| info == pg_constants::XLOG_XACT_ABORT
|
||||
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||
{
|
||||
let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
|
||||
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
|
||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(
|
||||
ObjectTag::TwoPhase(PrepareTag {
|
||||
xid: decoded.xl_xid,
|
||||
}),
|
||||
rec,
|
||||
)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
|
||||
{
|
||||
let blknum = buf.get_u32_le();
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let tag = if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||
ObjectTag::MultiXactOffsets(SlruBufferTag { blknum })
|
||||
} else {
|
||||
ObjectTag::MultiXactMembers(SlruBufferTag { blknum })
|
||||
};
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
||||
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
save_relmap_record(timeline, lsn, &xlrec, decoded)?;
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
checkpoint.nextOid = next_oid;
|
||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
buf.copy_to_slice(&mut checkpoint_bytes);
|
||||
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
|
||||
trace!(
|
||||
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
|
||||
xlog_checkpoint.oldestXid,
|
||||
checkpoint.oldestXid
|
||||
);
|
||||
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
|
||||
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Now that this record has been handled, let the repository know that
|
||||
// it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
|
||||
fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
|
||||
let db_id = rec.db_id;
|
||||
let tablespace_id = rec.tablespace_id;
|
||||
let src_db_id = rec.src_db_id;
|
||||
let src_tablespace_id = rec.src_tablespace_id;
|
||||
|
||||
// Creating a database is implemented by copying the template (aka. source) database.
|
||||
// To copy all the relations, we need to ask for the state as of the same LSN, but we
|
||||
// cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
|
||||
// the last valid LSN to advance up to it. So we use the previous record's LSN in the
|
||||
// get calls instead.
|
||||
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
|
||||
|
||||
let rels = timeline.list_rels(src_tablespace_id, src_db_id, req_lsn)?;
|
||||
|
||||
trace!("save_create_database: {} rels", rels.len());
|
||||
|
||||
let mut num_rels_copied = 0;
|
||||
let mut num_blocks_copied = 0;
|
||||
for src_rel in rels {
|
||||
assert_eq!(src_rel.spcnode, src_tablespace_id);
|
||||
assert_eq!(src_rel.dbnode, src_db_id);
|
||||
|
||||
let nblocks = timeline.get_rel_size(src_rel, req_lsn)?;
|
||||
let dst_rel = RelTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
relnode: src_rel.relnode,
|
||||
forknum: src_rel.forknum,
|
||||
};
|
||||
|
||||
// Copy content
|
||||
for blknum in 0..nblocks {
|
||||
let src_key = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: src_rel,
|
||||
blknum,
|
||||
});
|
||||
let dst_key = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: dst_rel,
|
||||
blknum,
|
||||
});
|
||||
|
||||
let content = timeline.get_page_at_lsn_nowait(src_key, req_lsn, false)?;
|
||||
|
||||
debug!("copying block {:?} to {:?}", src_key, dst_key);
|
||||
|
||||
timeline.put_page_image(dst_key, lsn, content, true)?;
|
||||
num_blocks_copied += 1;
|
||||
}
|
||||
|
||||
if nblocks == 0 {
|
||||
// make sure we have some trace of the relation, even if it's empty
|
||||
timeline.put_truncation(dst_rel, lsn, 0)?;
|
||||
}
|
||||
|
||||
num_rels_copied += 1;
|
||||
}
|
||||
// Copy relfilemap
|
||||
for tag in timeline.list_nonrels(req_lsn)? {
|
||||
match tag {
|
||||
ObjectTag::FileNodeMap(db) => {
|
||||
if db.spcnode == src_tablespace_id && db.dbnode == src_db_id {
|
||||
let img = timeline.get_page_at_lsn_nowait(tag, req_lsn, false)?;
|
||||
let new_tag = ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
});
|
||||
timeline.put_page_image(new_tag, lsn, img, false)?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {} // do nothing
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Created database {}/{}, copied {} blocks in {} rels at {}",
|
||||
tablespace_id, db_id, num_blocks_copied, num_rels_copied, lsn
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
|
||||
///
|
||||
/// This is the same logic as in PostgreSQL's smgr_redo() function.
|
||||
fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
|
||||
let spcnode = rec.rnode.spcnode;
|
||||
let dbnode = rec.rnode.dbnode;
|
||||
let relnode = rec.rnode.relnode;
|
||||
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
|
||||
let rel = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::MAIN_FORKNUM,
|
||||
};
|
||||
timeline.put_truncation(rel, lsn, rec.blkno)?;
|
||||
}
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 {
|
||||
let rel = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::FSM_FORKNUM,
|
||||
};
|
||||
|
||||
// FIXME: 'blkno' stored in the WAL record is the new size of the
|
||||
// heap. The formula for calculating the new size of the FSM is
|
||||
// pretty complicated (see FreeSpaceMapPrepareTruncateRel() in
|
||||
// PostgreSQL), and we should also clear bits in the tail FSM block,
|
||||
// and update the upper level FSM pages. None of that has been
|
||||
// implemented. What we do instead, is always just truncate the FSM
|
||||
// to zero blocks. That's bad for performance, but safe. (The FSM
|
||||
// isn't needed for correctness, so we could also leave garbage in
|
||||
// it. Seems more tidy to zap it away.)
|
||||
if rec.blkno != 0 {
|
||||
info!("Partial truncation of FSM is not supported");
|
||||
}
|
||||
let num_fsm_blocks = 0;
|
||||
timeline.put_truncation(rel, lsn, num_fsm_blocks)?;
|
||||
}
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
|
||||
let rel = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
};
|
||||
|
||||
// FIXME: Like with the FSM above, the logic to truncate the VM
|
||||
// correctly has not been implemented. Just zap it away completely,
|
||||
// always. Unlike the FSM, the VM must never have bits incorrectly
|
||||
// set. From a correctness point of view, it's always OK to clear
|
||||
// bits or remove it altogether, though.
|
||||
if rec.blkno != 0 {
|
||||
info!("Partial truncation of VM is not supported");
|
||||
}
|
||||
let num_vm_blocks = 0;
|
||||
timeline.put_truncation(rel, lsn, num_vm_blocks)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle XLOG_XACT_* records.
|
||||
///
|
||||
/// We are currently only interested in the dropped relations.
|
||||
fn save_xact_record(
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
parsed: &XlXactParsedRecord,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
// Record update of CLOG page
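// Each CLOG page covers CLOG_XACTS_PER_PAGE transactions; assuming the usual
// 8 KB BLCKSZ that is 8192 * 4 = 32768 (2 status bits per xact). For example,
// xid 100_000 would land on CLOG block 100_000 / 32768 = 3.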
|
||||
let mut blknum = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
|
||||
for subxact in &parsed.subxacts {
|
||||
let subxact_blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if subxact_blknum != blknum {
|
||||
blknum = subxact_blknum;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
}
|
||||
}
|
||||
for xnode in &parsed.xnodes {
|
||||
for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM {
|
||||
let rel_tag = RelTag {
|
||||
forknum,
|
||||
spcnode: xnode.spcnode,
|
||||
dbnode: xnode.dbnode,
|
||||
relnode: xnode.relnode,
|
||||
};
|
||||
timeline.put_unlink(rel_tag, lsn)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_create_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
|
||||
let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_blkno =
|
||||
(xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
// The members SLRU can, in contrast to the offsets one, be filled to almost
|
||||
// the full range at once. So we need to handle wraparound.
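// MAX_MBR_BLKNO is presumably the highest valid members-SLRU block number;
// once it is reached, block numbering wraps back to 0, which the loop below
// reproduces explicitly instead of using a plain inclusive range.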
|
||||
let mut blknum = first_mbr_blkno;
|
||||
loop {
|
||||
// Update members page
|
||||
let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
|
||||
if blknum == last_mbr_blkno {
|
||||
// last block inclusive
|
||||
break;
|
||||
}
|
||||
|
||||
// handle wraparound
|
||||
if blknum == MAX_MBR_BLKNO {
|
||||
blknum = 0;
|
||||
} else {
|
||||
blknum += 1;
|
||||
}
|
||||
}
|
||||
if xlrec.mid >= checkpoint.nextMulti {
|
||||
checkpoint.nextMulti = xlrec.mid + 1;
|
||||
}
|
||||
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
|
||||
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
|
||||
}
|
||||
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
|
||||
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
|
||||
mbr.xid
|
||||
} else {
|
||||
acc
|
||||
}
|
||||
});
|
||||
checkpoint.update_next_xid(max_mbr_xid);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
) -> Result<()> {
|
||||
checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
let first_off_blkno = xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let last_off_blkno = xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
// Delete all the segments except the last one. The last segment can still
|
||||
// contain, possibly partially, valid data.
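// (For example, with first_off_blkno = 10 and last_off_blkno = 13, blocks
// 10, 11 and 12 are truncated and block 13 is left in place.)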
|
||||
for blknum in first_off_blkno..last_off_blkno {
|
||||
let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
}
|
||||
let first_mbr_blkno = xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_blkno = xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
// The members SLRU can, in contrast to the offsets one, be filled to almost
|
||||
// the full range at once. So we need to handle wraparound.
|
||||
let mut blknum = first_mbr_blkno;
|
||||
// Delete all the segments but the last one. The last segment can still
|
||||
// contain, possibly partially, valid data.
|
||||
while blknum != last_mbr_blkno {
|
||||
let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
// handle wraparound
|
||||
if blknum == MAX_MBR_BLKNO {
|
||||
blknum = 0;
|
||||
} else {
|
||||
blknum += 1;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_relmap_record(
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlRelmapUpdate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let tag = ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: xlrec.tsid,
|
||||
dbnode: xlrec.dbid,
|
||||
});
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,274 +0,0 @@
|
||||
//
|
||||
// Restore chunks from S3
|
||||
//
|
||||
// This runs once at Page Server startup. It loads all the "base images" from
|
||||
// S3 into the in-memory page cache. It also initializes the "last valid LSN"
|
||||
// in the page cache to the LSN of the base image, so that when the WAL receiver
|
||||
// is started, it starts streaming from that LSN.
|
||||
//
|
||||
|
||||
use bytes::{Buf, BytesMut};
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::env;
|
||||
use std::fmt;
|
||||
|
||||
use s3::bucket::Bucket;
|
||||
use s3::creds::Credentials;
|
||||
use s3::region::Region;
|
||||
use s3::S3Error;
|
||||
|
||||
use tokio::runtime;
|
||||
|
||||
use futures::future;
|
||||
|
||||
use crate::{page_cache, PageServerConf};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
|
||||
struct Storage {
|
||||
region: Region,
|
||||
credentials: Credentials,
|
||||
bucket: String,
|
||||
}
|
||||
|
||||
pub fn restore_main(conf: &PageServerConf) {
|
||||
// Create a new thread pool
|
||||
let runtime = runtime::Runtime::new().unwrap();
|
||||
|
||||
runtime.block_on(async {
|
||||
let result = restore_chunk(conf).await;
|
||||
|
||||
match result {
|
||||
Ok(_) => {}
|
||||
Err(err) => {
|
||||
error!("S3 error: {}", err);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
//
|
||||
// Restores one chunk from S3.
|
||||
//
|
||||
// 1. Fetch the last base image >= given LSN
|
||||
// 2. Fetch all WAL
|
||||
//
|
||||
// Load it all into the page cache.
|
||||
//
|
||||
async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
let backend = Storage {
|
||||
region: Region::Custom {
|
||||
region: env::var("S3_REGION").unwrap(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap(),
|
||||
},
|
||||
credentials: Credentials::new(
|
||||
Some(&env::var("S3_ACCESSKEY").unwrap()),
|
||||
Some(&env::var("S3_SECRET").unwrap()),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.unwrap(),
|
||||
bucket: "zenith-testbucket".to_string(),
|
||||
};
|
||||
|
||||
info!("Restoring from S3...");
|
||||
|
||||
// Create Bucket in REGION for BUCKET
|
||||
let bucket = Bucket::new_with_path_style(&backend.bucket, backend.region, backend.credentials)?;
|
||||
|
||||
// List out contents of directory
|
||||
let results: Vec<s3::serde_types::ListBucketResult> = bucket
|
||||
.list("relationdata/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
|
||||
// TODO: get that from backup
|
||||
let sys_id: u64 = 42;
|
||||
let mut oldest_lsn = 0;
|
||||
let mut slurp_futures: Vec<_> = Vec::new();
|
||||
|
||||
for result in results {
|
||||
for object in result.contents {
|
||||
// Download every relation file, slurping them into memory
|
||||
|
||||
let key = object.key;
|
||||
let relpath = key.strip_prefix("relationdata/").unwrap();
|
||||
|
||||
let parsed = parse_rel_file_path(&relpath);
|
||||
|
||||
match parsed {
|
||||
Ok(p) => {
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
let b = bucket.clone();
|
||||
let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);
|
||||
|
||||
slurp_futures.push(f);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("unrecognized file: {} ({})", relpath, e);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if oldest_lsn == 0 {
|
||||
panic!("no base backup found");
|
||||
}
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
pcache.init_valid_lsn(oldest_lsn);
|
||||
|
||||
info!("{} files to restore...", slurp_futures.len());
|
||||
|
||||
future::join_all(slurp_futures).await;
|
||||
info!("restored!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ParsedBaseImageFileName {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u8,
|
||||
pub segno: u32,
|
||||
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(fname)
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode: u32 = relnode_str.parse()?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
segno_match.unwrap().as_str().parse::<u32>()?
|
||||
};
|
||||
|
||||
let lsn_hi: u64 = caps.name("lsnhi").unwrap().as_str().parse()?;
|
||||
let lsn_lo: u64 = caps.name("lsnlo").unwrap().as_str().parse()?;
|
||||
let lsn = lsn_hi << 32 | lsn_lo;
|
||||
|
||||
Ok((relnode, forknum, segno, lsn))
|
||||
}
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
/*
|
||||
* Relation data files can be in one of the following directories:
|
||||
*
|
||||
* global/
|
||||
* shared relations
|
||||
*
|
||||
* base/<db oid>/
|
||||
* regular relations, default tablespace
|
||||
*
|
||||
* pg_tblspc/<tblspc oid>/<tblspc version>/
|
||||
* within a non-default tablespace (the name of the directory
|
||||
* depends on version)
|
||||
*
|
||||
* And the relation data files themselves have a filename like:
|
||||
*
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
})
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split("/");
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let dbnode: u32 = dbnode_str.parse()?;
|
||||
let fname = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
if s.next().is_some() {
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
};
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
})
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
Err(FilePathError::new("tablespaces not supported"))
|
||||
} else {
|
||||
Err(FilePathError::new("invalid relation data file name"))
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Load a base file from S3, and insert it into the page cache
|
||||
//
|
||||
async fn slurp_base_file(
|
||||
conf: &PageServerConf,
|
||||
sys_id: u64,
|
||||
bucket: Bucket,
|
||||
s3path: String,
|
||||
parsed: ParsedBaseImageFileName,
|
||||
) {
|
||||
// FIXME: rust-s3 opens a new connection for each request. Should reuse
|
||||
// the reqwest::Client object. But that requires changes to rust-s3 itself.
|
||||
let (data, code) = bucket.get_object(s3path.clone()).await.unwrap();
|
||||
|
||||
trace!("got response: {} on {}", code, &s3path);
|
||||
assert_eq!(200, code);
|
||||
|
||||
let mut bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
rel: page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
|
||||
|
||||
blknum += 1;
|
||||
}
|
||||
}
|
||||
443
pageserver/src/rocksdb_storage.rs
Normal file
@@ -0,0 +1,443 @@
|
||||
//!
|
||||
//! An implementation of the ObjectStore interface, backed by RocksDB
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::object_store::ObjectStore;
|
||||
use crate::repository::RelTag;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct StorageKey {
|
||||
obj_key: ObjectKey,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl StorageKey {
|
||||
/// The first key for a given timeline
|
||||
fn timeline_start(timeline: ZTimelineId) -> Self {
|
||||
Self {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
tag: ObjectTag::TimelineMetadataTag,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
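// The key layout relies on StorageKey serializing (via BeSer, presumably a
// big-endian encoding) such that the raw byte order in RocksDB matches the
// logical (timeline, tag, lsn) order, with `lsn` last. The seek()/seek_for_prev()
// calls below depend on that property to find the right version of an object.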
|
||||
|
||||
///
/// RocksDB deletes random records very inefficiently. Instead, we rely on a compaction
/// (merge) filter, which lets us throw away records during the LSM merge phase.
/// Unfortunately, it is hard (if at all possible) to determine at merge time whether a
/// version can be removed. A version can be removed if:
/// 1. It is above the PITR horizon (we need the current LSN and gc_horizon from the config)
/// 2. The page has been reconstructed at the horizon (all WAL records above the horizon
///    have been applied and can be removed)
///
/// So the GC process reconstructs pages at the horizon and marks obsolete WAL records for
/// deletion. To mark an object for deletion we could set a flag in the object itself, but
/// that is awkward with the new object value format, because the RocksDB storage layer
/// knows nothing about that format. Updating a whole record just to set one bit would also
/// be inefficient. This is why we keep the keys of versions marked for deletion in an
/// in-memory HashSet. When the compaction filter finds a key in this set, it removes the
/// key from the set, preventing the set from growing without bound.
///
|
||||
struct GarbageCollector {
|
||||
garbage: Mutex<HashSet<Vec<u8>>>,
|
||||
}
|
||||
|
||||
impl GarbageCollector {
|
||||
fn new() -> GarbageCollector {
|
||||
GarbageCollector {
|
||||
garbage: Mutex::new(HashSet::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Called by GC to mark version as delete
|
||||
fn mark_for_deletion(&self, key: &[u8]) {
|
||||
let mut garbage = self.garbage.lock().unwrap();
|
||||
garbage.insert(key.to_vec());
|
||||
}
|
||||
|
||||
/// Called by LSM merge filter. If it finds key in the set, then
|
||||
/// it doesn't merge it and removes from this set.
|
||||
fn was_deleted(&self, key: &[u8]) -> bool {
|
||||
let key = key.to_vec();
|
||||
let mut garbage = self.garbage.lock().unwrap();
|
||||
garbage.remove(&key)
|
||||
}
|
||||
}
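// A minimal sketch (not part of the original file) of how the deletion set is
// meant to be used: GC marks a serialized key, and the compaction filter later
// consumes it exactly once. The key bytes here are arbitrary placeholders.
#[cfg(test)]
mod gc_sketch {
use super::GarbageCollector;

#[test]
fn mark_then_filter() {
let gc = GarbageCollector::new();
let key = b"serialized StorageKey bytes".to_vec();

// GC decides this version is obsolete.
gc.mark_for_deletion(&key);

// The compaction filter sees the key, drops the record, and removes the
// key from the set...
assert!(gc.was_deleted(&key));
// ...so a second lookup for the same key returns false.
assert!(!gc.was_deleted(&key));
}
}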
|
||||
|
||||
pub struct RocksObjectStore {
|
||||
_conf: &'static PageServerConf,
|
||||
|
||||
// RocksDB handle
|
||||
db: rocksdb::DB,
|
||||
gc: Arc<GarbageCollector>,
|
||||
}
|
||||
|
||||
impl ObjectStore for RocksObjectStore {
|
||||
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
|
||||
let val = self.db.get(StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?)?;
|
||||
if let Some(val) = val {
|
||||
Ok(val)
|
||||
} else {
|
||||
bail!("could not find page {:?}", key);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
|
||||
let mut iter = self.db.raw_iterator();
|
||||
let search_key = StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
iter.seek(search_key.ser()?);
|
||||
if !iter.valid() {
|
||||
Ok(None)
|
||||
} else {
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
Ok(Some(key.obj_key.clone()))
|
||||
}
|
||||
}
|
||||
|
||||
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
|
||||
self.db.put(
|
||||
StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?,
|
||||
value,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
|
||||
self.gc.mark_for_deletion(&StorageKey::ser(&StorageKey {
|
||||
obj_key: key.clone(),
|
||||
lsn,
|
||||
})?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Iterate through page versions of a given page, starting from the given LSN.
|
||||
/// The versions are walked in descending LSN order.
|
||||
fn object_versions<'a>(
|
||||
&'a self,
|
||||
key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
|
||||
let iter = RocksObjectVersionIter::new(&self.db, key, lsn)?;
|
||||
Ok(Box::new(iter))
|
||||
}
|
||||
|
||||
/// Iterate through all timeline objects
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
|
||||
let iter = RocksObjectIter::new(&self.db, timeline, nonrel_only, lsn)?;
|
||||
Ok(Box::new(iter))
|
||||
}
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
///
|
||||
/// TODO: This implementation is very inefficient, it scans
|
||||
/// through all entries in the given database. In practice, this
|
||||
/// is used for CREATE DATABASE, and usually the template database is small.
|
||||
/// But if it's not, this will be slow.
|
||||
fn list_rels(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>> {
|
||||
// FIXME: This scans everything. Very slow
|
||||
|
||||
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||
|
||||
let mut search_rel_tag = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: 0,
|
||||
forknum: 0u8,
|
||||
};
|
||||
let mut iter = self.db.raw_iterator();
|
||||
loop {
|
||||
let search_key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::RelationMetadata(search_rel_tag),
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
iter.seek(search_key.ser()?);
|
||||
if !iter.valid() {
|
||||
break;
|
||||
}
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
|
||||
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
|
||||
if spcnode != 0 && rel_tag.spcnode != spcnode
|
||||
|| dbnode != 0 && rel_tag.dbnode != dbnode
|
||||
{
|
||||
break;
|
||||
}
|
||||
if key.lsn <= lsn {
|
||||
// visible in this snapshot
|
||||
rels.insert(rel_tag);
|
||||
}
|
||||
search_rel_tag = rel_tag;
|
||||
// skip to next relation
|
||||
// FIXME: What if relnode is u32::MAX ?
|
||||
search_rel_tag.relnode += 1;
|
||||
} else {
|
||||
// no more relation metadata entries
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
/// Returns all versions up to and including the specified LSN.
|
||||
fn objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
|
||||
let start_key = StorageKey::timeline_start(timeline);
|
||||
let start_key_bytes = StorageKey::ser(&start_key)?;
|
||||
let iter = self.db.iterator(rocksdb::IteratorMode::From(
|
||||
&start_key_bytes,
|
||||
rocksdb::Direction::Forward,
|
||||
));
|
||||
|
||||
Ok(Box::new(RocksObjects {
|
||||
iter,
|
||||
timeline,
|
||||
lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
fn compact(&self) {
|
||||
self.db.compact_range::<&[u8], &[u8]>(None, None);
|
||||
}
|
||||
}
|
||||
|
||||
impl RocksObjectStore {
|
||||
/// Open a RocksDB database.
|
||||
pub fn open(conf: &'static PageServerConf) -> Result<RocksObjectStore> {
|
||||
let opts = Self::get_rocksdb_opts();
|
||||
let obj_store = Self::new(conf, opts)?;
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
/// Create a new, empty RocksDB database.
|
||||
pub fn create(conf: &'static PageServerConf) -> Result<RocksObjectStore> {
|
||||
let path = conf.workdir.join("rocksdb-storage");
|
||||
std::fs::create_dir(&path)?;
|
||||
|
||||
let mut opts = Self::get_rocksdb_opts();
|
||||
opts.create_if_missing(true);
|
||||
opts.set_error_if_exists(true);
|
||||
let obj_store = Self::new(conf, opts)?;
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
fn new(conf: &'static PageServerConf, mut opts: rocksdb::Options) -> Result<RocksObjectStore> {
|
||||
let path = conf.workdir.join("rocksdb-storage");
|
||||
let gc = Arc::new(GarbageCollector::new());
|
||||
let gc_ref = gc.clone();
|
||||
opts.set_compaction_filter("ttl", move |_level: u32, key: &[u8], _val: &[u8]| {
|
||||
if gc_ref.was_deleted(key) {
|
||||
rocksdb::compaction_filter::Decision::Remove
|
||||
} else {
|
||||
rocksdb::compaction_filter::Decision::Keep
|
||||
}
|
||||
});
|
||||
let db = rocksdb::DB::open(&opts, &path)?;
|
||||
let obj_store = RocksObjectStore {
|
||||
_conf: conf,
|
||||
db,
|
||||
gc,
|
||||
};
|
||||
Ok(obj_store)
|
||||
}
|
||||
|
||||
/// common options used by `open` and `create`
|
||||
fn get_rocksdb_opts() -> rocksdb::Options {
|
||||
let mut opts = rocksdb::Options::default();
|
||||
opts.set_use_fsync(true);
|
||||
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
|
||||
opts
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `object_versions`. Returns all page versions of a given block, in
|
||||
/// reverse LSN order.
|
||||
///
|
||||
struct RocksObjectVersionIter<'a> {
|
||||
obj_key: ObjectKey,
|
||||
dbiter: rocksdb::DBRawIterator<'a>,
|
||||
first_call: bool,
|
||||
}
|
||||
impl<'a> RocksObjectVersionIter<'a> {
|
||||
fn new(
|
||||
db: &'a rocksdb::DB,
|
||||
obj_key: &ObjectKey,
|
||||
lsn: Lsn,
|
||||
) -> Result<RocksObjectVersionIter<'a>> {
|
||||
let key = StorageKey {
|
||||
obj_key: obj_key.clone(),
|
||||
lsn,
|
||||
};
|
||||
let mut dbiter = db.raw_iterator();
|
||||
dbiter.seek_for_prev(StorageKey::ser(&key)?); // locate last entry
|
||||
Ok(RocksObjectVersionIter {
|
||||
first_call: true,
|
||||
obj_key: obj_key.clone(),
|
||||
dbiter,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl<'a> Iterator for RocksObjectVersionIter<'a> {
|
||||
type Item = (Lsn, Vec<u8>);
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
if self.first_call {
|
||||
self.first_call = false;
|
||||
} else {
|
||||
self.dbiter.prev(); // walk backwards
|
||||
}
|
||||
|
||||
if !self.dbiter.valid() {
|
||||
return None;
|
||||
}
|
||||
let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
|
||||
if key.obj_key.tag != self.obj_key.tag {
|
||||
return None;
|
||||
}
|
||||
let val = self.dbiter.value().unwrap();
|
||||
let result = val.to_vec();
|
||||
|
||||
Some((key.lsn, result))
|
||||
}
|
||||
}
|
||||
|
||||
struct RocksObjects<'r> {
|
||||
iter: rocksdb::DBIterator<'r>,
|
||||
timeline: ZTimelineId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl<'r> Iterator for RocksObjects<'r> {
|
||||
// TODO consider returning Box<[u8]>
|
||||
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_result().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r> RocksObjects<'r> {
|
||||
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
|
||||
for (key_bytes, v) in &mut self.iter {
|
||||
let key = StorageKey::des(&key_bytes)?;
|
||||
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if key.lsn > self.lsn {
|
||||
// TODO can speed up by seeking iterator
|
||||
continue;
|
||||
}
|
||||
|
||||
return Ok(Some((key.obj_key.tag, key.lsn, v.to_vec())));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator for `list_objects`. Returns all objects preceding the specified LSN
|
||||
///
|
||||
struct RocksObjectIter<'a> {
|
||||
timeline: ZTimelineId,
|
||||
key: StorageKey,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
dbiter: rocksdb::DBRawIterator<'a>,
|
||||
}
|
||||
impl<'a> RocksObjectIter<'a> {
|
||||
fn new(
|
||||
db: &'a rocksdb::DB,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<RocksObjectIter<'a>> {
|
||||
let key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
tag: ObjectTag::FirstTag,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
let dbiter = db.raw_iterator();
|
||||
Ok(RocksObjectIter {
|
||||
key,
|
||||
timeline,
|
||||
nonrel_only,
|
||||
lsn,
|
||||
dbiter,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl<'a> Iterator for RocksObjectIter<'a> {
|
||||
type Item = ObjectTag;
|
||||
|
||||
fn next(&mut self) -> std::option::Option<Self::Item> {
|
||||
loop {
|
||||
self.dbiter.seek(StorageKey::ser(&self.key).unwrap());
|
||||
if !self.dbiter.valid() {
|
||||
return None;
|
||||
}
|
||||
let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
|
||||
if key.obj_key.timeline != self.timeline {
|
||||
// End of this timeline
|
||||
return None;
|
||||
}
|
||||
self.key = key.clone();
|
||||
self.key.lsn = Lsn(u64::MAX); // next seek should skip all versions
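// Assuming the serialized key orders by tag first and lsn last, seeking to
// (tag, u64::MAX) on the next iteration lands on the first entry of the
// *next* object tag, so each tag is reported only once.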
|
||||
if key.lsn <= self.lsn {
|
||||
// visible in this snapshot
|
||||
if self.nonrel_only {
|
||||
match key.obj_key.tag {
|
||||
ObjectTag::RelationMetadata(_) => return None,
|
||||
ObjectTag::RelationBuffer(_) => return None,
|
||||
_ => return Some(key.obj_key.tag),
|
||||
}
|
||||
} else {
|
||||
return Some(key.obj_key.tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -54,14 +54,14 @@ impl Events {
|
||||
thread::spawn(move || {
|
||||
let stdin = io::stdin();
|
||||
for evt in stdin.keys() {
|
||||
if let Ok(key) = evt {
|
||||
if let Err(err) = tx.send(Event::Input(key)) {
|
||||
eprintln!("{}", err);
|
||||
return;
|
||||
}
|
||||
if !ignore_exit_key.load(Ordering::Relaxed) && key == config.exit_key {
|
||||
return;
|
||||
}
|
||||
// This will panic if stdin returns EOF.
|
||||
let key = evt.unwrap();
|
||||
if let Err(err) = tx.send(Event::Input(key)) {
|
||||
eprintln!("{}", err);
|
||||
return;
|
||||
}
|
||||
if !ignore_exit_key.load(Ordering::Relaxed) && key == config.exit_key {
|
||||
return;
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
@@ -1,9 +1,15 @@
|
||||
//!
|
||||
//! WAL decoder. For each WAL record, it decodes the record to figure out which data blocks
|
||||
//! the record affects, to add the records to the page cache.
|
||||
//!
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use postgres_ffi::xlog_utils::XLogRecord;
|
||||
use postgres_ffi::*;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::XLogLongPageHeaderData;
|
||||
use postgres_ffi::XLogPageHeaderData;
|
||||
use postgres_ffi::XLogRecord;
|
||||
use std::cmp::min;
|
||||
use std::str;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
@@ -14,37 +20,6 @@ pub type OffsetNumber = u16;
|
||||
pub type MultiXactId = TransactionId;
|
||||
pub type MultiXactOffset = u32;
|
||||
pub type MultiXactStatus = u32;
|
||||
pub type TimeLineID = u32;
|
||||
pub type PgTime = i64;
|
||||
|
||||
// From PostgreSQL headers
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogPageHeaderData {
|
||||
xlp_magic: u16, /* magic value for correctness checks */
|
||||
xlp_info: u16, /* flag bits, see below */
|
||||
xlp_tli: TimeLineID, /* TimeLineID of first record on page */
|
||||
xlp_pageaddr: u64, /* XLOG address of this page */
|
||||
xlp_rem_len: u32, /* total len of remaining data for record */
|
||||
}
|
||||
|
||||
// FIXME: this assumes MAXIMUM_ALIGNOF 8. There are 4 padding bytes at end
|
||||
#[allow(non_upper_case_globals)]
|
||||
const SizeOfXLogShortPHD: usize = 2 + 2 + 4 + 8 + 4 + 4;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogLongPageHeaderData {
|
||||
std: XLogPageHeaderData, /* standard header fields */
|
||||
xlp_sysid: u64, /* system identifier from pg_control */
|
||||
xlp_seg_size: u32, /* just as a cross-check */
|
||||
xlp_xlog_blcksz: u32, /* just as a cross-check */
|
||||
}
|
||||
|
||||
// FIXME: this assumes MAXIMUM_ALIGNOF 8.
|
||||
#[allow(non_upper_case_globals)]
|
||||
const SizeOfXLogLongPHD: usize = (2 + 2 + 4 + 8 + 4) + 4 + 8 + 4 + 4;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub struct WalStreamDecoder {
|
||||
@@ -84,13 +59,6 @@ impl WalStreamDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_position(&mut self, lsn: Lsn) {
|
||||
self.lsn = lsn;
|
||||
self.contlen = 0;
|
||||
self.padlen = 0;
|
||||
self.inputbuf.clear();
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
@@ -99,7 +67,7 @@ impl WalStreamDecoder {
|
||||
/// decoder so far.
|
||||
///
|
||||
/// Returns one of the following:
|
||||
/// Ok((u64, Bytes)): a tuple containing the LSN of next record, and the record itself
|
||||
/// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
|
||||
///
|
||||
@@ -109,11 +77,12 @@ impl WalStreamDecoder {
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < SizeOfXLogLongPHD {
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = self.decode_XLogLongPageHeaderData();
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.std.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog segment header".into(),
|
||||
@@ -122,14 +91,15 @@ impl WalStreamDecoder {
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += SizeOfXLogLongPHD as u64;
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
|
||||
continue;
|
||||
} else if self.lsn.block_offset() == 0 {
|
||||
if self.inputbuf.remaining() < SizeOfXLogShortPHD {
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = self.decode_XLogPageHeaderData();
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog page header".into(),
|
||||
@@ -138,7 +108,7 @@ impl WalStreamDecoder {
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += SizeOfXLogShortPHD as u64;
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
|
||||
continue;
|
||||
} else if self.padlen > 0 {
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
@@ -159,7 +129,7 @@ impl WalStreamDecoder {
|
||||
// read xl_tot_len FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = self.inputbuf.get_u32_le();
|
||||
if xl_tot_len < SizeOfXLogRecord {
|
||||
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
@@ -218,40 +188,6 @@ impl WalStreamDecoder {
|
||||
|
||||
// deal with xlog_switch records
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
fn decode_XLogPageHeaderData(&mut self) -> XLogPageHeaderData {
|
||||
let buf = &mut self.inputbuf;
|
||||
|
||||
// FIXME: Assume little-endian
|
||||
|
||||
let hdr: XLogPageHeaderData = XLogPageHeaderData {
|
||||
xlp_magic: buf.get_u16_le(),
|
||||
xlp_info: buf.get_u16_le(),
|
||||
xlp_tli: buf.get_u32_le(),
|
||||
xlp_pageaddr: buf.get_u64_le(),
|
||||
xlp_rem_len: buf.get_u32_le(),
|
||||
};
|
||||
// 4 bytes of padding, on 64-bit systems
|
||||
buf.advance(4);
|
||||
|
||||
// FIXME: check that hdr.xlp_rem_len matches self.contlen
|
||||
//println!("next xlog page (xlp_rem_len: {})", hdr.xlp_rem_len);
|
||||
|
||||
hdr
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
fn decode_XLogLongPageHeaderData(&mut self) -> XLogLongPageHeaderData {
|
||||
let hdr: XLogLongPageHeaderData = XLogLongPageHeaderData {
|
||||
std: self.decode_XLogPageHeaderData(),
|
||||
xlp_sysid: self.inputbuf.get_u64_le(),
|
||||
xlp_seg_size: self.inputbuf.get_u32_le(),
|
||||
xlp_xlog_blcksz: self.inputbuf.get_u32_le(),
|
||||
};
|
||||
|
||||
hdr
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
@@ -273,7 +209,7 @@ pub struct DecodedBkpBlock {
|
||||
/* Information on full-page image, if any */
|
||||
has_image: bool, /* has image, even for consistency checking */
|
||||
pub apply_image: bool, /* has image that should be restored */
|
||||
pub will_init: bool,
|
||||
pub will_init: bool, /* record doesn't need previous page version to apply */
|
||||
//char *bkp_image;
|
||||
hole_offset: u16,
|
||||
hole_length: u16,
|
||||
@@ -309,10 +245,8 @@ impl DecodedBkpBlock {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
const SizeOfXLogRecord: u32 = 24;
|
||||
|
||||
pub struct DecodedWALRecord {
|
||||
pub xl_xid: TransactionId,
|
||||
pub xl_info: u8,
|
||||
pub xl_rmid: u8,
|
||||
pub record: Bytes, // raw XLogRecord
|
||||
@@ -356,9 +290,7 @@ pub struct XlSmgrTruncate {
|
||||
}
|
||||
|
||||
impl XlSmgrTruncate {
|
||||
pub fn decode(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance((SizeOfXLogRecord + 2) as usize);
|
||||
pub fn decode(buf: &mut Bytes) -> XlSmgrTruncate {
|
||||
XlSmgrTruncate {
|
||||
blkno: buf.get_u32_le(),
|
||||
rnode: RelFileNode {
|
||||
@@ -381,9 +313,7 @@ pub struct XlCreateDatabase {
|
||||
}
|
||||
|
||||
impl XlCreateDatabase {
|
||||
pub fn decode(decoded: &DecodedWALRecord) -> XlCreateDatabase {
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance((SizeOfXLogRecord + 2) as usize);
|
||||
pub fn decode(buf: &mut Bytes) -> XlCreateDatabase {
|
||||
XlCreateDatabase {
|
||||
db_id: buf.get_u32_le(),
|
||||
tablespace_id: buf.get_u32_le(),
|
||||
@@ -469,6 +399,103 @@ impl XlHeapUpdate {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Note: parsing of some fields is skipped, because they're not needed.
|
||||
///
|
||||
/// This is similar to the xl_xact_parsed_commit and
|
||||
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
|
||||
/// struct for commits and aborts.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
pub struct XlXactParsedRecord {
|
||||
pub xid: TransactionId,
|
||||
pub info: u8,
|
||||
pub xact_time: TimestampTz,
|
||||
pub xinfo: u32,
|
||||
|
||||
pub db_id: Oid, /* MyDatabaseId */
|
||||
pub ts_id: Oid, /* MyDatabaseTableSpace */
|
||||
|
||||
pub subxacts: Vec<TransactionId>,
|
||||
|
||||
pub xnodes: Vec<RelFileNode>,
|
||||
}
|
||||
|
||||
impl XlXactParsedRecord {
|
||||
/// Decode a XLOG_XACT_COMMIT/ABORT/COMMIT_PREPARED/ABORT_PREPARED
|
||||
/// record. This should agree with the ParseCommitRecord and ParseAbortRecord
|
||||
/// functions in PostgreSQL (in src/backend/access/rmgr/xactdesc.c)
|
||||
pub fn decode(buf: &mut Bytes, mut xid: TransactionId, xl_info: u8) -> XlXactParsedRecord {
|
||||
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
// The record starts with time of commit/abort
|
||||
let xact_time = buf.get_i64_le();
|
||||
let xinfo;
|
||||
if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
} else {
|
||||
xinfo = 0;
|
||||
}
|
||||
let db_id;
|
||||
let ts_id;
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
db_id = buf.get_u32_le();
|
||||
ts_id = buf.get_u32_le();
|
||||
} else {
|
||||
db_id = 0;
|
||||
ts_id = 0;
|
||||
}
|
||||
let mut subxacts = Vec::<TransactionId>::new();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
subxacts.push(subxact);
|
||||
}
|
||||
}
|
||||
let mut xnodes = Vec::<RelFileNode>::new();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
xnodes.push(RelFileNode {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
});
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
|
||||
let nmsgs = buf.get_i32_le();
|
||||
for _i in 0..nmsgs {
|
||||
let sizeof_shared_invalidation_message = 0;
|
||||
buf.advance(sizeof_shared_invalidation_message);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
|
||||
xid = buf.get_u32_le();
|
||||
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
|
||||
}
|
||||
XlXactParsedRecord {
|
||||
xid,
|
||||
info,
|
||||
xact_time,
|
||||
xinfo,
|
||||
db_id,
|
||||
ts_id,
|
||||
subxacts,
|
||||
xnodes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct MultiXactMember {
|
||||
@@ -515,14 +542,14 @@ impl XlMultiXactCreate {
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlMultiXactTruncate {
|
||||
oldest_multi_db: Oid,
|
||||
pub oldest_multi_db: Oid,
|
||||
/* to-be-truncated range of multixact offsets */
|
||||
start_trunc_off: MultiXactId, /* just for completeness' sake */
|
||||
end_trunc_off: MultiXactId,
|
||||
pub start_trunc_off: MultiXactId, /* just for completeness' sake */
|
||||
pub end_trunc_off: MultiXactId,
|
||||
|
||||
/* to-be-truncated range of multixact members */
|
||||
start_trunc_memb: MultiXactOffset,
|
||||
end_trunc_memb: MultiXactOffset,
|
||||
pub start_trunc_memb: MultiXactOffset,
|
||||
pub end_trunc_memb: MultiXactOffset,
|
||||
}
|
||||
|
||||
impl XlMultiXactTruncate {
|
||||
@@ -537,8 +564,7 @@ impl XlMultiXactTruncate {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Routines to decode a WAL record and figure out which blocks are modified
|
||||
/// Main routine to decode a WAL record and figure out which blocks are modified
|
||||
//
|
||||
// See xlogrecord.h for details
|
||||
// The overall layout of an XLOG record is:
|
||||
@@ -556,7 +582,7 @@ impl XlMultiXactTruncate {
|
||||
// block data
|
||||
// ...
|
||||
// main data
|
||||
pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedWALRecord {
|
||||
pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
let mut rnode_spcnode: u32 = 0;
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
@@ -574,15 +600,10 @@ pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedW
|
||||
xlogrec.xl_rmid,
|
||||
xlogrec.xl_info
|
||||
);
|
||||
if xlogrec.xl_xid > checkpoint.nextXid.value as u32 {
|
||||
// TODO: handle XID wraparound
|
||||
checkpoint.nextXid = FullTransactionId {
|
||||
value: (checkpoint.nextXid.value & 0xFFFFFFFF00000000) | xlogrec.xl_xid as u64,
|
||||
};
|
||||
}
|
||||
let remaining = xlogrec.xl_tot_len - SizeOfXLogRecord;
|
||||
|
||||
if buf.remaining() != remaining as usize {
|
||||
let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD;
|
||||
|
||||
if buf.remaining() != remaining {
|
||||
//TODO error
|
||||
}
|
||||
|
||||
@@ -790,194 +811,10 @@ pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedW
|
||||
assert_eq!(buf.remaining(), main_data_len as usize);
|
||||
}
|
||||
|
||||
//5. Handle special CLOG and XACT records
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = buf.get_i32_le() as u32;
|
||||
blk.will_init = true;
|
||||
trace!("RM_CLOG_ID updates block {}", blk.blkno);
|
||||
blocks.push(blk);
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
|
||||
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
|
||||
xlogrec.xl_prev & 0xffffffff,
|
||||
xlogrec.xl_xid,
|
||||
blk.blkno,
|
||||
main_data_len
|
||||
);
|
||||
blocks.push(blk);
|
||||
}
|
||||
//parse commit record to extract subtrans entries
|
||||
// xl_xact_commit starts with time of commit
|
||||
let _xact_time = buf.get_i64_le();
|
||||
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
let mut prev_blkno = u32::MAX;
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if prev_blkno != blkno {
|
||||
prev_blkno = blkno;
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blocks.push(blk);
|
||||
}
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO handle this too?
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
|
||||
let nmsgs = buf.get_i32_le();
|
||||
for _i in 0..nmsgs {
|
||||
let sizeof_shared_invalidation_message = 0;
|
||||
buf.advance(sizeof_shared_invalidation_message);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
|
||||
let xid = buf.get_u32_le();
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
blocks.push(blk);
|
||||
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
|
||||
//TODO handle this to be able to restore pg_twophase on node start
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT || info == pg_constants::XLOG_XACT_ABORT_PREPARED {
|
||||
if info == pg_constants::XLOG_XACT_ABORT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_ABORT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
|
||||
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
|
||||
xlogrec.xl_prev & 0xffffffff,
|
||||
xlogrec.xl_xid,
|
||||
blk.blkno,
|
||||
main_data_len
|
||||
);
|
||||
blocks.push(blk);
|
||||
}
|
||||
//parse abort record to extract subtrans entries
|
||||
// xl_xact_abort starts with time of commit
|
||||
let _xact_time = buf.get_i64_le();
|
||||
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
let mut prev_blkno = u32::MAX;
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if prev_blkno != blkno {
|
||||
prev_blkno = blkno;
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blocks.push(blk);
|
||||
}
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO save these too
|
||||
trace!(
|
||||
"XLOG_XACT_ABORT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
|
||||
let xid = buf.get_u32_le();
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
blocks.push(blk);
|
||||
trace!("XLOG_XACT_ABORT-XACT_XINFO_HAS_TWOPHASE");
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_TWOPHASE_FORKNUM;
|
||||
blk.blkno = xlogrec.xl_xid;
|
||||
blk.will_init = true;
|
||||
blocks.push(blk);
|
||||
info!("Prepare transaction {}", xlogrec.xl_xid);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_DBASE_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_DBASE_CREATE {
|
||||
//buf points to main_data
|
||||
let db_id = buf.get_u32_le();
|
||||
let tablespace_id = buf.get_u32_le();
|
||||
let src_db_id = buf.get_u32_le();
|
||||
let src_tablespace_id = buf.get_u32_le();
|
||||
trace!(
|
||||
"XLOG_DBASE_CREATE tablespace_id/db_id {}/{} src_db_id {}/{}",
|
||||
tablespace_id,
|
||||
db_id,
|
||||
src_tablespace_id,
|
||||
src_db_id
|
||||
);
|
||||
// in postgres it is implemented as copydir
|
||||
// we need to copy all pages in page_cache
|
||||
} else {
|
||||
trace!("XLOG_DBASE_DROP is not handled yet");
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_TBLSPC_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_TBLSPC_CREATE {
|
||||
//buf points to main_data
|
||||
let ts_id = buf.get_u32_le();
|
||||
let ts_path = str::from_utf8(&buf).unwrap();
|
||||
trace!("XLOG_TBLSPC_CREATE ts_id {} ts_path {}", ts_id, ts_path);
|
||||
} else {
|
||||
trace!("XLOG_TBLSPC_DROP is not handled yet");
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
// 5. Handle a few special record types that modify blocks without registering
|
||||
// them with the standard mechanism.
|
||||
if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = XlHeapInsert::decode(&mut buf);
|
||||
@@ -1031,7 +868,7 @@ pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedW
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = XlHeapMultiInsert::decode(&mut buf);
|
||||
if (xlrec.flags
|
||||
@@ -1049,105 +886,10 @@ pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedW
|
||||
blocks.push(blk);
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_MXACT_OFFSETS_FORKNUM;
|
||||
blk.blkno = buf.get_u32_le();
|
||||
blk.will_init = true;
|
||||
blocks.push(blk);
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_MXACT_MEMBERS_FORKNUM;
|
||||
blk.blkno = buf.get_u32_le();
|
||||
blk.will_init = true;
|
||||
blocks.push(blk);
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
// Update offset page
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.blkno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
blk.forknum = pg_constants::PG_MXACT_OFFSETS_FORKNUM;
|
||||
blocks.push(blk);
|
||||
let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_blkno =
|
||||
(xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
for blkno in first_mbr_blkno..=last_mbr_blkno {
|
||||
// Update members page
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_MXACT_MEMBERS_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blocks.push(blk);
|
||||
}
|
||||
if xlrec.mid > checkpoint.nextMulti {
|
||||
checkpoint.nextMulti = xlrec.mid;
|
||||
}
|
||||
if xlrec.moff > checkpoint.nextMultiOffset {
|
||||
checkpoint.nextMultiOffset = xlrec.moff;
|
||||
}
|
||||
let max_xid = xlrec
|
||||
.members
|
||||
.iter()
|
||||
.fold(checkpoint.nextXid.value as u32, |acc, mbr| {
|
||||
if mbr.xid > acc {
|
||||
mbr.xid
|
||||
} else {
|
||||
acc
|
||||
}
|
||||
});
|
||||
checkpoint.nextXid = FullTransactionId {
|
||||
value: (checkpoint.nextXid.value & 0xFFFFFFFF00000000) | max_xid as u64,
|
||||
};
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
||||
checkpoint.oldestXid = xlrec.end_trunc_off;
|
||||
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
let first_off_blkno =
|
||||
xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let last_off_blkno =
|
||||
xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
for blkno in first_off_blkno..last_off_blkno {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_MXACT_OFFSETS_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.will_init = true;
|
||||
blocks.push(blk);
|
||||
}
|
||||
let first_mbr_blkno =
|
||||
xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_blkno =
|
||||
xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
for blkno in first_mbr_blkno..last_mbr_blkno {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_MXACT_MEMBERS_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.will_init = true;
|
||||
blocks.push(blk);
|
||||
}
|
||||
} else {
|
||||
assert!(false);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_FILENODEMAP_FORKNUM;
|
||||
blk.rnode_spcnode = xlrec.tsid;
|
||||
blk.rnode_dbnode = xlrec.dbid;
|
||||
blk.rnode_relnode = 0;
|
||||
blk.will_init = true;
|
||||
blocks.push(blk);
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XLOG_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
if next_oid > checkpoint.nextOid {
|
||||
checkpoint.nextOid = next_oid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
DecodedWALRecord {
|
||||
xl_xid: xlogrec.xl_xid,
|
||||
xl_info: xlogrec.xl_info,
|
||||
xl_rmid: xlogrec.xl_rmid,
|
||||
record,
|
||||
|
||||
@@ -1,17 +1,16 @@
|
||||
//!
|
||||
//! WAL receiver
|
||||
//!
|
||||
//! The WAL receiver connects to the WAL safekeeper service, and streams WAL.
|
||||
//! For each WAL record, it decodes the record to figure out which data blocks
|
||||
//! the record affects, and adds the records to the page cache.
|
||||
//! WAL receiver connects to the WAL safekeeper service,
|
||||
//! streams WAL, decodes records and saves them in page cache.
|
||||
//!
|
||||
//! We keep one WAL receiver active per timeline.
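
// A minimal sketch (not part of this patch) of the decode step described in
// the comment above: bytes from the replication stream are fed into a
// WalStreamDecoder, which hands back complete WAL records with their end LSNs.
// The name `decode_stream_sketch` and the exact parameter types are my reading
// of the calls in walreceiver_main() below, not something this patch defines.
#[allow(dead_code)]
fn decode_stream_sketch(startpoint: Lsn, chunks: &[&[u8]]) -> Result<Vec<Lsn>, Error> {
    let mut waldecoder = WalStreamDecoder::new(startpoint);
    let mut record_lsns = Vec::new();
    for chunk in chunks {
        // Feed raw bytes, then drain every record that is now complete.
        waldecoder.feed_bytes(chunk);
        while let Some((lsn, _recdata)) = waldecoder.poll_decode()? {
            record_lsns.push(lsn);
        }
    }
    Ok(record_lsns)
}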
|
||||
|
||||
use crate::object_key::*;
|
||||
use crate::page_cache;
|
||||
use crate::repository::*;
|
||||
use crate::restore_local_repo;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Error;
|
||||
use anyhow::{Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
@@ -21,6 +20,7 @@ use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
@@ -121,7 +121,10 @@ fn walreceiver_main(
|
||||
) -> Result<(), Error> {
|
||||
// Connect to the database in replication mode.
|
||||
info!("connecting to {:?}", wal_producer_connstr);
|
||||
let connect_cfg = format!("{} replication=true", wal_producer_connstr);
|
||||
let connect_cfg = format!(
|
||||
"{} application_name=pageserver replication=true",
|
||||
wal_producer_connstr
|
||||
);
|
||||
|
||||
let mut rclient = Client::connect(&connect_cfg, NoTls)?;
|
||||
info!("connected!");
|
||||
@@ -140,18 +143,13 @@ fn walreceiver_main(
|
||||
// If we had previously received WAL up to some point in the middle of a WAL record, we
|
||||
// better start from the end of last full WAL record, not in the middle of one. Hence,
|
||||
// use 'last_record_lsn' rather than 'last_valid_lsn' here.
|
||||
let last_rec_lsn = timeline.get_last_record_lsn();
|
||||
let mut last_rec_lsn = timeline.get_last_record_lsn();
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
if startpoint == Lsn(0) {
|
||||
error!("No previous WAL position");
|
||||
}
|
||||
|
||||
startpoint = Lsn::max(
|
||||
startpoint,
|
||||
Lsn(end_of_wal.0 & !(pg_constants::WAL_SEGMENT_SIZE as u64 - 1)),
|
||||
);
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
// FIXME: It probably would be better to always start streaming from the beginning
|
||||
@@ -171,22 +169,20 @@ fn walreceiver_main(
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut checkpoint = CheckPoint::new(startpoint.0, identify.timeline);
|
||||
let checkpoint_tag = BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM);
|
||||
if let Some(checkpoint_bytes) = timeline.get_page_image(checkpoint_tag, Lsn(0))? {
|
||||
checkpoint = decode_checkpoint(checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
} else {
|
||||
error!("No checkpoint record was found in reposistory");
|
||||
}
|
||||
let checkpoint_bytes =
|
||||
timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint, false)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
|
||||
while let Some(replication_message) = physical_stream.next()? {
|
||||
match replication_message {
|
||||
let status_update = match replication_message {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
let prev_last_rec_lsn = last_rec_lsn;
|
||||
|
||||
write_wal_file(startlsn, timelineid, pg_constants::WAL_SEGMENT_SIZE, data)?;
|
||||
|
||||
@@ -195,17 +191,28 @@ fn walreceiver_main(
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let old_checkpoint_bytes = encode_checkpoint(checkpoint);
|
||||
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
|
||||
timeline.save_decoded_record(decoded, recdata, lsn)?;
|
||||
// Save old checkpoint value to compare with it after decoding WAL record
|
||||
let old_checkpoint_bytes = checkpoint.encode();
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
restore_local_repo::save_decoded_record(
|
||||
&mut checkpoint,
|
||||
&*timeline,
|
||||
&decoded,
|
||||
recdata,
|
||||
lsn,
|
||||
)?;
|
||||
last_rec_lsn = lsn;
|
||||
|
||||
let new_checkpoint_bytes = encode_checkpoint(checkpoint);
|
||||
let new_checkpoint_bytes = checkpoint.encode();
|
||||
// Check if checkpoint data was updated by save_decoded_record
|
||||
if new_checkpoint_bytes != old_checkpoint_bytes {
|
||||
timeline.put_page_image(checkpoint_tag, Lsn(0), new_checkpoint_bytes);
|
||||
timeline.put_page_image(
|
||||
ObjectTag::Checkpoint,
|
||||
lsn,
|
||||
new_checkpoint_bytes,
|
||||
false,
|
||||
)?;
|
||||
}
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
}
|
||||
|
||||
// Update the last_valid LSN value in the page cache one more time. We updated
|
||||
@@ -216,16 +223,46 @@ fn walreceiver_main(
|
||||
// flush ptr.
|
||||
timeline.advance_last_valid_lsn(endlsn);
|
||||
|
||||
// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
|
||||
// "checkpoint" the repository to flush all the changes from WAL we've processed
|
||||
// so far to disk. After this, we don't need the original WAL anymore, and it
|
||||
// can be removed. This is probably too aggressive for production, but it's useful
|
||||
// to expose bugs now.
|
||||
//
|
||||
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
|
||||
// and we might need it for logical decoding or other things in the future. Although
|
||||
// we should also be able to fetch it back from the WAL safekeepers or S3 if
|
||||
// needed.
|
||||
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
{
|
||||
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
|
||||
let (oldest_segno, newest_segno) = find_wal_file_range(
|
||||
timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
last_rec_lsn,
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
timeline.checkpoint()?;
|
||||
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
caught_up = true;
|
||||
}
|
||||
|
||||
Some(endlsn)
|
||||
}
|
||||
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
let wal_end = keepalive.wal_end();
|
||||
let timestamp = keepalive.timestamp();
|
||||
let reply_requested: bool = keepalive.reply() != 0;
|
||||
let reply_requested = keepalive.reply() != 0;
|
||||
|
||||
trace!(
|
||||
"received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})",
|
||||
@@ -233,25 +270,71 @@ fn walreceiver_main(
|
||||
timestamp,
|
||||
reply_requested,
|
||||
);
|
||||
if reply_requested {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(timeline.get_last_valid_lsn()));
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::from(0);
|
||||
let ts = SystemTime::now();
|
||||
const NO_REPLY: u8 = 0u8;
|
||||
|
||||
physical_stream
|
||||
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
if reply_requested {
|
||||
Some(timeline.get_last_valid_lsn())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
|
||||
_ => None,
|
||||
};
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(last_lsn));
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::from(0);
|
||||
let ts = SystemTime::now();
|
||||
const NO_REPLY: u8 = 0;
|
||||
|
||||
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_wal_file_range(
|
||||
timeline: ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
written_upto: Lsn,
|
||||
) -> Result<(u64, u64)> {
|
||||
let written_upto_segno = written_upto.segment_number(wal_seg_size);
|
||||
|
||||
let mut oldest_segno = written_upto_segno;
|
||||
let mut newest_segno = written_upto_segno;
|
||||
// Scan the WAL directory, and count how many WAL files we could remove
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
for entry in fs::read_dir(wal_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let filename = path.file_name().unwrap().to_str().unwrap();
|
||||
|
||||
if IsXLogFileName(filename) {
|
||||
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
|
||||
|
||||
if segno > written_upto_segno {
|
||||
// that's strange.
|
||||
warn!("there is a WAL file from future at {}", path.display());
|
||||
continue;
|
||||
}
|
||||
|
||||
oldest_segno = min(oldest_segno, segno);
|
||||
newest_segno = max(newest_segno, segno);
|
||||
}
|
||||
}
|
||||
// FIXME: would be good to assert that there are no gaps in the WAL files
|
||||
|
||||
Ok((oldest_segno, newest_segno))
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
//!
|
||||
//! WAL redo
|
||||
//! WAL redo. This service runs PostgreSQL in a special wal_redo mode
|
||||
//! to apply given WAL records over an old page image and return new page image.
|
||||
//!
|
||||
//! We rely on Postgres to perform WAL redo for us. We launch a
|
||||
//! postgres process in special "wal redo" mode that's similar to
|
||||
//! single-user mode. We then pass the the previous page image, if any,
|
||||
//! single-user mode. We then pass the previous page image, if any,
|
||||
//! and all the WAL records we want to apply, to the postgres
|
||||
//! process. Then we get the page image back. Communication with the
|
||||
//! postgres process happens via stdin/stdout
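
// Illustrative sketch (not part of this patch): the request for one page, as it
// is written to the wal-redo process's stdin, is a concatenation of the
// messages produced by the build_*_msg helpers defined later in this file.
// The helper name below is hypothetical, and BufferTag is assumed to be Copy.
#[allow(dead_code)]
fn build_redo_request_sketch(
    tag: BufferTag,
    base_img: Option<Bytes>,
    records: &[WALRecord],
) -> Bytes {
    let mut msgs = BytesMut::new();
    // 'B': begin redo for this block
    msgs.put(build_begin_redo_for_block_msg(tag));
    // 'P': push the previous page image, if we have one
    if let Some(img) = base_img {
        msgs.put(build_push_page_msg(tag, img));
    }
    // one apply-record message per WAL record, in LSN order
    for rec in records {
        msgs.put(build_apply_record_msg(rec.lsn, rec.rec.clone()));
    }
    // 'G': ask for the resulting page image back
    msgs.put(build_get_page_msg(tag));
    msgs.freeze()
}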
|
||||
@@ -17,15 +18,15 @@
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use std::assert;
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::Error;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::sync::mpsc;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
@@ -33,15 +34,18 @@ use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{ChildStdin, ChildStdout, Command};
|
||||
use tokio::time::timeout;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::object_key::*;
|
||||
use crate::repository::BufferTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::XlXactParsedRecord;
|
||||
use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::XLogRecord;
|
||||
use postgres_ffi::XLogRecord;
|
||||
|
||||
///
|
||||
/// WAL Redo Manager is responsible for replaying WAL records.
|
||||
@@ -56,13 +60,31 @@ pub trait WalRedoManager: Send + Sync {
|
||||
/// the records.
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
tag: ObjectTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
}
|
||||
|
||||
///
|
||||
/// A dummy WAL Redo Manager implementation that doesn't allow replaying
|
||||
/// anything. Currently used during bootstrapping (zenith init), to create
|
||||
/// a Repository object without launching the real WAL redo process.
|
||||
///
|
||||
pub struct DummyRedoManager {}
|
||||
impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
_tag: ObjectTag,
|
||||
_lsn: Lsn,
|
||||
_base_img: Option<Bytes>,
|
||||
_records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
Err(WalRedoError::InvalidState)
|
||||
}
|
||||
}
|
||||
|
||||
static TIMEOUT: Duration = Duration::from_secs(20);
|
||||
|
||||
///
|
||||
@@ -82,13 +104,16 @@ struct PostgresRedoManagerInternal {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct WalRedoRequest {
|
||||
tag: BufferTag,
|
||||
struct WalRedoRequestData {
|
||||
tag: ObjectTag,
|
||||
lsn: Lsn,
|
||||
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct WalRedoRequest {
|
||||
data: WalRedoRequestData,
|
||||
response_channel: mpsc::Sender<Result<Bytes, WalRedoError>>,
|
||||
}
|
||||
|
||||
@@ -97,6 +122,9 @@ struct WalRedoRequest {
|
||||
pub enum WalRedoError {
|
||||
#[error(transparent)]
|
||||
IoError(#[from] std::io::Error),
|
||||
|
||||
#[error("cannot perform WAL redo now")]
|
||||
InvalidState,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -142,7 +170,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
tag: ObjectTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
@@ -151,10 +179,12 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
|
||||
|
||||
let request = WalRedoRequest {
|
||||
tag,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
data: WalRedoRequestData {
|
||||
tag,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
},
|
||||
response_channel: tx,
|
||||
};
|
||||
|
||||
@@ -170,22 +200,21 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
}
|
||||
|
||||
fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
|
||||
return ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
|
||||
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
|
||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize;
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
|
||||
}
|
||||
|
||||
fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
|
||||
return (xid as u16) % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP
|
||||
* pg_constants::MXACT_MEMBER_BITS_PER_XACT;
|
||||
(xid as u16) % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP
|
||||
* pg_constants::MXACT_MEMBER_BITS_PER_XACT
|
||||
}
|
||||
|
||||
/* Location (byte offset within page) of TransactionId of given member */
|
||||
fn mx_offset_to_member_offset(xid: MultiXactId) -> usize {
|
||||
return mx_offset_to_flags_offset(xid)
|
||||
mx_offset_to_flags_offset(xid)
|
||||
+ (pg_constants::MULTIXACT_FLAGBYTES_PER_GROUP
|
||||
+ (xid as u16 % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP) * 4)
|
||||
as usize;
|
||||
+ (xid as u16 % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP) * 4) as usize
|
||||
}
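
// Illustration (not part of this patch): where a given member offset lands
// inside a multixact members page, using the three helpers above. The exact
// byte positions depend on the MULTIXACT_* values in pg_constants.
#[allow(dead_code)]
fn show_member_layout(offset: MultiXactId) {
    println!(
        "member {}: flag byte at {}, bit shift {}, xid bytes at {}..{}",
        offset,
        mx_offset_to_flags_offset(offset),
        mx_offset_to_flags_bitshift(offset),
        mx_offset_to_member_offset(offset),
        mx_offset_to_member_offset(offset) + 4,
    );
}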
|
||||
|
||||
///
|
||||
@@ -206,45 +235,64 @@ impl PostgresRedoManagerInternal {
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let process: PostgresRedoProcess;
|
||||
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = self.conf.workdir.join("wal-redo-datadir");
|
||||
let processes: Vec<PostgresRedoProcess>;
|
||||
|
||||
info!("launching WAL redo postgres process");
|
||||
|
||||
process = runtime
|
||||
.block_on(PostgresRedoProcess::launch(&datadir))
|
||||
.unwrap();
|
||||
let wal_redoers = self.conf.wal_redoers;
|
||||
processes = (0..wal_redoers)
|
||||
.map(|i| {
|
||||
runtime
|
||||
.block_on(PostgresRedoProcess::launch(self.conf, i))
|
||||
.unwrap()
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Loop forever, handling requests as they come.
|
||||
loop {
|
||||
let request = self
|
||||
.request_rx
|
||||
.recv()
|
||||
.expect("WAL redo request channel was closed");
|
||||
let mut requests: Vec<WalRedoRequest> = Vec::new();
|
||||
requests.push(
|
||||
self.request_rx
|
||||
.recv()
|
||||
.expect("WAL redo request channel was closed"),
|
||||
);
|
||||
loop {
|
||||
let req = self.request_rx.try_recv();
|
||||
match req {
|
||||
Ok(req) => requests.push(req),
|
||||
Err(_) => break,
|
||||
}
|
||||
}
|
||||
let request_data = requests.iter().map(|req| &req.data);
|
||||
let mut rr = 0; // round robin
|
||||
let results = runtime.block_on(async {
|
||||
let futures = request_data.map(|req| {
|
||||
rr += 1;
|
||||
self.handle_apply_request(&processes[rr % wal_redoers], &req)
|
||||
});
|
||||
let mut results: Vec<Result<Bytes, WalRedoError>> = Vec::new();
|
||||
for future in futures {
|
||||
results.push(future.await);
|
||||
}
|
||||
results
|
||||
});
|
||||
for (result, request) in results.into_iter().zip(requests.iter()) {
|
||||
let result_ok = result.is_ok();
|
||||
|
||||
let result = runtime.block_on(self.handle_apply_request(&process, &request));
|
||||
let result_ok = result.is_ok();
|
||||
// Send the result to the requester
|
||||
let _ = request.response_channel.send(result);
|
||||
|
||||
// Send the result to the requester
|
||||
let _ = request.response_channel.send(result);
|
||||
|
||||
if !result_ok {
|
||||
error!("wal-redo-postgres failed to apply request {:?}", request);
|
||||
if !result_ok {
|
||||
error!("wal-redo-postgres failed to apply request {:?}", request);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo.
|
||||
///
|
||||
async fn handle_apply_request(
|
||||
&self,
|
||||
process: &PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
request: &WalRedoRequestData,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let tag = request.tag;
|
||||
let lsn = request.lsn;
|
||||
@@ -256,14 +304,21 @@ impl PostgresRedoManagerInternal {
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if tag.rel.forknum > pg_constants::INIT_FORKNUM {
|
||||
if let ObjectTag::RelationBuffer(buf_tag) = tag {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
|
||||
} else {
|
||||
// Non-relational WAL records we apply ourselves.
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
// If full-page image is provided, then use it...
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
// otherwise initialize page with zeros
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
// Apply all collected WAL records
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
@@ -282,148 +337,78 @@ impl PostgresRedoManagerInternal {
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
// The only operation we need to implement is CLOG_ZEROPAGE
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
// Transaction manager stuff
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let mut status = 0;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
|
||||
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
|
||||
}
|
||||
//handle subtrans
|
||||
let _xact_time = buf.get_i64_le();
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
let tag_blknum = match tag {
|
||||
ObjectTag::Clog(slru) => slru.blknum,
|
||||
ObjectTag::TwoPhase(_) => {
|
||||
assert!(info == pg_constants::XLOG_XACT_PREPARE);
|
||||
trace!("Apply prepare {} record", xlogrec.xl_xid);
|
||||
page.clear();
|
||||
page.extend_from_slice(&buf[..]);
|
||||
continue;
|
||||
}
|
||||
_ => panic!("Not valid XACT object tag {:?}", tag),
|
||||
};
|
||||
let parsed_xact =
|
||||
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
|
||||
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag_blknum == blkno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag.blknum == blkno {
|
||||
status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
|
||||
transaction_id_set_status(subxact, status, &mut page);
|
||||
}
|
||||
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag_blknum == blkno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO handle this too?
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
|
||||
let nmsgs = buf.get_i32_le();
|
||||
for _i in 0..nmsgs {
|
||||
let sizeof_shared_invalidation_message = 0;
|
||||
buf.advance(sizeof_shared_invalidation_message);
|
||||
}
|
||||
}
|
||||
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
|
||||
let xid = buf.get_u32_le();
|
||||
transaction_id_set_status(xid, status, &mut page);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT || info == pg_constants::XLOG_XACT_ABORT_PREPARED {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
if info == pg_constants::XLOG_XACT_ABORT {
|
||||
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
|
||||
}
|
||||
//handle subtrans
|
||||
let _xact_time = buf.get_i64_le();
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag.blknum == blkno {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
transaction_id_set_status(subxact, status, &mut page);
|
||||
}
|
||||
}
|
||||
}
|
||||
if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO handle this too?
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
|
||||
let nmsgs = buf.get_i32_le();
|
||||
for _i in 0..nmsgs {
|
||||
let sizeof_shared_invalidation_message = 0;
|
||||
buf.advance(sizeof_shared_invalidation_message);
|
||||
}
|
||||
}
|
||||
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
|
||||
let xid = buf.get_u32_le();
|
||||
transaction_id_set_status(xid, status, &mut page);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||
info!("Apply prepare {} record", xlogrec.xl_xid);
|
||||
page.clear();
|
||||
page.extend_from_slice(&buf[..]);
|
||||
} else {
|
||||
error!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn,
|
||||
record.main_data_offset, record.rec.len());
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
// Multixact operations
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
|
||||
{
|
||||
// Just need to zero page
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
if tag.rel.forknum == pg_constants::PG_MXACT_OFFSETS_FORKNUM {
|
||||
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
} else {
|
||||
assert!(tag.rel.forknum == pg_constants::PG_MXACT_MEMBERS_FORKNUM);
|
||||
if let ObjectTag::MultiXactMembers(slru) = tag {
|
||||
for i in 0..xlrec.nmembers {
|
||||
let blkno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
if blkno == tag.blknum {
|
||||
if blkno == slru.blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
@@ -445,14 +430,17 @@ impl PostgresRedoManagerInternal {
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Multixact offsets SLRU
|
||||
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
// empty page image indicates that this SLRU page is truncated and can be removed by GC
|
||||
page.clear();
|
||||
} else {
|
||||
assert!(false);
|
||||
panic!();
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
// Relation map file has a size of 512 bytes
|
||||
page.clear();
|
||||
page.extend_from_slice(&buf[12..]); // skip xl_relmap_update
|
||||
assert!(page.len() == 512); // size of pg_filenode.map
|
||||
@@ -460,15 +448,13 @@ impl PostgresRedoManagerInternal {
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
} else {
|
||||
apply_result = process.apply_wal_records(tag, base_img, records).await;
|
||||
}
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result: Result<Bytes, WalRedoError>;
|
||||
|
||||
trace!(
|
||||
debug!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
@@ -490,20 +476,20 @@ impl PostgresRedoManagerInternal {
|
||||
}
|
||||
|
||||
struct PostgresRedoProcess {
|
||||
stdin: RefCell<ChildStdin>,
|
||||
stdout: RefCell<ChildStdout>,
|
||||
stdin: Arc<RefCell<ChildStdin>>,
|
||||
stdout: Arc<RefCell<ChildStdout>>,
|
||||
}
|
||||
|
||||
impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
// Tests that run the pageserver binary set the proper PG_BIN_DIR
// and PG_LIB_DIR so that WAL redo starts the right postgres.
|
||||
async fn launch(conf: &PageServerConf, id: usize) -> Result<PostgresRedoProcess, Error> {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = conf.workdir.join(format!("wal-redo-datadir-{}", id));
|
||||
|
||||
// do that: We may later
|
||||
// switch to setting same things in pageserver config file.
|
||||
async fn launch(datadir: &Path) -> Result<PostgresRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
info!("directory {:?} exists, removing", &datadir);
|
||||
@@ -512,9 +498,12 @@ impl PostgresRedoProcess {
|
||||
}
|
||||
}
|
||||
info!("running initdb in {:?}", datadir.display());
|
||||
let initdb = Command::new("initdb")
|
||||
let initdb = Command::new(conf.pg_bin_dir().join("initdb"))
|
||||
.args(&["-D", datadir.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.output()
|
||||
.await
|
||||
.expect("failed to execute initdb");
|
||||
@@ -536,12 +525,15 @@ impl PostgresRedoProcess {
|
||||
config.write_all(b"zenith.wal_redo=on\n")?;
|
||||
}
|
||||
// Start postgres itself
|
||||
let mut child = Command::new("postgres")
|
||||
let mut child = Command::new(conf.pg_bin_dir().join("postgres"))
|
||||
.arg("--wal-redo")
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env("PGDATA", datadir)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGDATA", &datadir)
|
||||
.spawn()
|
||||
.expect("postgres --wal-redo command failed to start");
|
||||
|
||||
@@ -576,8 +568,8 @@ impl PostgresRedoProcess {
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(PostgresRedoProcess {
|
||||
stdin: RefCell::new(stdin),
|
||||
stdout: RefCell::new(stdout),
|
||||
stdin: Arc::new(RefCell::new(stdin)),
|
||||
stdout: Arc::new(RefCell::new(stdout)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -665,9 +657,26 @@ fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
// FIXME: this is a temporary hack that should go away when we refactor
|
||||
// the postgres protocol serialization + handlers.
|
||||
//
|
||||
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
|
||||
// not in the std library. To write to a BytesMut from a serde serializer,
|
||||
// we need to either:
|
||||
// - pre-allocate the required buffer space. This is annoying because we
|
||||
// shouldn't care what the exact serialized size is-- that's the
|
||||
// serializer's job.
|
||||
// - Or, we need to create a temporary "writer" (which implements the
|
||||
// `Write` trait). It's a bit awkward, because the writer consumes the
|
||||
// underlying BytesMut, and we need to extract it later with
|
||||
// `into_inner`.
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
@@ -680,10 +689,13 @@ fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let mut buf = writer.into_inner();
|
||||
buf.put(base_img);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
@@ -697,7 +709,7 @@ fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put(rec);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
@@ -708,9 +720,12 @@ fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
lazy_static = "1.4"
|
||||
log = "0.4.14"
|
||||
memoffset = "0.6.2"
|
||||
thiserror = "1.0"
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
|
||||
@@ -1,3 +1,25 @@
|
||||
This module contains utility functions for interacting with PostgreSQL
|
||||
file formats.
|
||||
This module contains utilities for working with PostgreSQL file
|
||||
formats. It's a collection of structs that are auto-generated from the
|
||||
PostgreSQL header files using bindgen, and Rust functions to read and
|
||||
manipulate them.
|
||||
|
||||
There are also a bunch of constants in `pg_constants.rs` that are copied
|
||||
from various PostgreSQL headers, rather than auto-generated. They mostly
|
||||
should be auto-generated too, but that's a TODO.
|
||||
|
||||
The PostgreSQL on-disk file format is not portable across different
|
||||
CPU architectures and operating systems. It is also subject to change
|
||||
in each major PostgreSQL version. Currently, this module is based on
|
||||
PostgreSQL v14, but in the future we will probably need a separate
|
||||
copy for each PostgreSQL version.
|
||||
|
||||
To interact with the C structs, there is some unsafe code in this
|
||||
module. Do not copy-paste that to the rest of the codebase! Keep the
|
||||
amount of unsafe code to a minimum, and limited to this module only,
|
||||
and only where it's truly needed.
|
||||
|
||||
TODO: Currently, there is also some code that deals with WAL records
|
||||
in pageserver/src/waldecoder.rs. That should be moved into this
|
||||
module. The rest of the codebase should not have intimate knowledge of
|
||||
PostgreSQL file formats or WAL layout, that knowledge should be
|
||||
encapsulated in this module.
|
||||
|
||||
@@ -11,29 +11,42 @@ fn main() {
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
// The input header we would like to generate
|
||||
// bindings for.
|
||||
//
|
||||
// All the needed PostgreSQL headers are included from 'pg_control_ffi.h'
|
||||
//
|
||||
.header("pg_control_ffi.h")
|
||||
//
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
//
|
||||
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
|
||||
//
|
||||
// These are the types and constants that we want to generate bindings for
|
||||
//
|
||||
.whitelist_type("ControlFileData")
|
||||
.whitelist_type("CheckPoint")
|
||||
.whitelist_type("FullTransactionId")
|
||||
.whitelist_type("XLogRecord")
|
||||
.whitelist_type("XLogPageHeaderData")
|
||||
.whitelist_type("XLogLongPageHeaderData")
|
||||
.whitelist_var("XLOG_PAGE_MAGIC")
|
||||
.whitelist_var("PG_CONTROL_FILE_SIZE")
|
||||
.whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
|
||||
.whitelist_type("DBState")
|
||||
//
|
||||
// Path the server include dir. It is in tmp_install/include/server, if you did
|
||||
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
|
||||
// and used DESTDIR to move it into tmp_install, then it's in
|
||||
// tmp_install/include/postgres/server
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
//
|
||||
.clang_arg("-I../tmp_install/include/server")
|
||||
.clang_arg("-I../tmp_install/include/postgresql/server")
|
||||
//
|
||||
// Finish the builder and generate the bindings.
|
||||
//
|
||||
.generate()
|
||||
// Unwrap the Result and panic on failure.
|
||||
.expect("Unable to generate bindings");
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
/*
|
||||
* This header file is the input to bindgen. It includes all the
|
||||
* PostgreSQL headers that we need to auto-generate Rust structs
|
||||
* from. If you need to expose a new struct to Rust code, add the
|
||||
* header here, and whitelist the struct in the build.rs file.
|
||||
*/
|
||||
#include "c.h"
|
||||
#include "catalog/pg_control.h"
|
||||
#include "access/xlog_internal.h"
|
||||
|
||||
const uint32 PG_CONTROLFILEDATA_OFFSETOF_CRC = offsetof(ControlFileData, crc);
|
||||
|
||||
124
postgres_ffi/src/controlfile_utils.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
//!
|
||||
//! Utilities for reading and writing the PostgreSQL control file.
|
||||
//!
|
||||
//! The PostgreSQL control file is one of the first things that the PostgreSQL
|
||||
//! server reads when it starts up. It indicates whether the server was shut
|
||||
//! down cleanly, or if it crashed or was restored from online backup so that
|
||||
//! WAL recovery needs to be performed. It also contains a copy of the latest
|
||||
//! checkpoint record and its location in the WAL.
|
||||
//!
|
||||
//! The control file also contains fields for detecting whether the
|
||||
//! data directory is compatible with a postgres binary. That includes
|
||||
//! a version number, configuration options that can be set at
|
||||
//! compilation time like the block size, and the platform's alignment
|
||||
//! and endianness information. (The PostgreSQL on-disk file format is
|
||||
//! not portable across platforms.)
|
||||
//!
|
||||
//! The control file is stored in the PostgreSQL data directory, as
|
||||
//! `global/pg_control`. The data stored in it is designed to be smaller than
|
||||
//! 512 bytes, on the assumption that it can be updated atomically. The actual
|
||||
//! file is larger, 8192 bytes, but the rest of it is just filled with zeros.
|
||||
//!
|
||||
//! See src/include/catalog/pg_control.h in the PostgreSQL sources for more
|
||||
//! information. You can use PostgreSQL's pg_controldata utility to view its
|
||||
//! contents.
|
||||
//!
|
||||
use crate::{ControlFileData, PG_CONTROL_FILE_SIZE};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
|
||||
/// Equivalent to sizeof(ControlFileData) in C
|
||||
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
|
||||
|
||||
impl ControlFileData {
|
||||
/// Compute the offset of the `crc` field within the `ControlFileData` struct.
|
||||
/// Equivalent to offsetof(ControlFileData, crc) in C.
|
||||
// Someday this can be const when the right compiler features land.
|
||||
fn pg_control_crc_offset() -> usize {
|
||||
memoffset::offset_of!(ControlFileData, crc)
|
||||
}
|
||||
|
||||
///
|
||||
/// Interpret a slice of bytes as a Postgres control file.
|
||||
///
|
||||
pub fn decode(buf: &[u8]) -> Result<ControlFileData> {
|
||||
// Check that the slice has the expected size. The control file is
|
||||
// padded with zeros up to a 512 byte sector size, so accept a
|
||||
// larger size too, so that the caller can just pass the whole file
|
||||
// contents without knowing the exact size of the struct.
|
||||
if buf.len() < SIZEOF_CONTROLDATA {
|
||||
bail!("control file is too short");
|
||||
}
|
||||
|
||||
// Compute the expected CRC of the content.
|
||||
let OFFSETOF_CRC = Self::pg_control_crc_offset();
|
||||
let expectedcrc = crc32c::crc32c(&buf[0..OFFSETOF_CRC]);
|
||||
|
||||
// Convert the slice into an array of the right size, and use `transmute` to
|
||||
// reinterpret the raw bytes as a ControlFileData struct.
|
||||
//
|
||||
// NB: Ideally we would use 'zerocopy::FromBytes' for this, but bindgen doesn't
|
||||
// derive FromBytes for us. The safety of this depends on the same constraints
|
||||
// as for FromBytes, namely, all of its fields must implement FromBytes. That
|
||||
// includes the primitive integer types, like `u8`, `u16`, `u32`, `u64` and their
|
||||
// signed variants. But `bool` is not safe, because the contents of the high bits
|
||||
// in a rust bool are undefined. In practice, PostgreSQL uses 1 to represent
|
||||
// true and 0 for false, which is compatible with Rust bool, but let's try not to
|
||||
// depend on it.
|
||||
//
|
||||
// FIXME: ControlFileData does contain 'bool's at the moment.
|
||||
//
|
||||
// See https://github.com/zenithdb/zenith/issues/207 for discussion on the safety
|
||||
// of this.
|
||||
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
|
||||
b.copy_from_slice(&buf[0..SIZEOF_CONTROLDATA]);
|
||||
let controlfile: ControlFileData =
|
||||
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
// Check the CRC
|
||||
if expectedcrc != controlfile.crc {
|
||||
bail!(
|
||||
"invalid CRC in control file: expected {:08X}, was {:08X}",
|
||||
expectedcrc,
|
||||
controlfile.crc
|
||||
);
|
||||
}
|
||||
|
||||
Ok(controlfile)
|
||||
}
|
||||
|
||||
///
|
||||
/// Convert a struct representing a Postgres control file into raw bytes.
|
||||
///
|
||||
/// The CRC is recomputed to match the contents of the fields.
|
||||
pub fn encode(&self) -> Bytes {
|
||||
//
|
||||
// Use `transmute` to reinterpret struct as raw bytes.
|
||||
//
|
||||
// FIXME: This triggers undefined behavior, because the contents
|
||||
// of the padding bytes are undefined, and this leaks those
|
||||
// undefined bytes into the resulting array. The Rust code won't
|
||||
// care what's in those bytes, and PostgreSQL doesn't care
|
||||
// either. HOWEVER, it is a potential security issue, because the
|
||||
// bytes can contain arbitrary pieces of memory from the page
|
||||
// server. In the worst case, that could be private keys or
|
||||
// another tenant's data.
|
||||
//
|
||||
// See https://github.com/zenithdb/zenith/issues/207 for discussion.
|
||||
let b: [u8; SIZEOF_CONTROLDATA] =
|
||||
unsafe { std::mem::transmute::<ControlFileData, [u8; SIZEOF_CONTROLDATA]>(*self) };
|
||||
|
||||
// Recompute the CRC
|
||||
let OFFSETOF_CRC = Self::pg_control_crc_offset();
|
||||
let newcrc = crc32c::crc32c(&b[0..OFFSETOF_CRC]);
|
||||
|
||||
let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize);
|
||||
buf.extend_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
buf.extend_from_slice(&newcrc.to_ne_bytes());
|
||||
// Fill the rest of the control file with zeros.
|
||||
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
|
||||
|
||||
buf.into()
|
||||
}
|
||||
}
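
// Usage sketch (not part of this patch): decode a control file read from local
// disk and re-encode it. The path handling here is hypothetical; the pageserver
// normally gets these bytes from the repository rather than the filesystem.
#[allow(dead_code)]
pub fn inspect_control_file(datadir: &std::path::Path) -> Result<ControlFileData> {
    let raw = std::fs::read(datadir.join("global/pg_control"))?;
    let controlfile = ControlFileData::decode(&raw)?; // verifies the CRC
    let reencoded = controlfile.encode(); // recomputes the CRC
    assert_eq!(reencoded.len(), PG_CONTROL_FILE_SIZE as usize);
    Ok(controlfile)
}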
|
||||
@@ -3,110 +3,8 @@
|
||||
#![allow(non_snake_case)]
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
pub mod controlfile_utils;
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod xlog_utils;
|
||||
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
|
||||
// sizeof(ControlFileData)
|
||||
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
|
||||
const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
|
||||
const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize;
|
||||
|
||||
impl ControlFileData {
|
||||
// Initialize an all-zeros ControlFileData struct
|
||||
pub fn new() -> ControlFileData {
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
let b = [0u8; SIZEOF_CONTROLDATA];
|
||||
controlfile =
|
||||
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
controlfile
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
|
||||
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
|
||||
buf.copy_to_slice(&mut b);
|
||||
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
// TODO: verify CRC
|
||||
let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
|
||||
data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
let expectedcrc = crc32c::crc32c(&data_without_crc);
|
||||
|
||||
controlfile = unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
if expectedcrc != controlfile.crc {
|
||||
anyhow::bail!(
|
||||
"invalid CRC in control file: expected {:08X}, was {:08X}",
|
||||
expectedcrc,
|
||||
controlfile.crc
|
||||
);
|
||||
}
|
||||
|
||||
Ok(controlfile)
|
||||
}
|
||||
|
||||
pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
|
||||
let b: [u8; SIZEOF_CONTROLDATA];
|
||||
|
||||
b = unsafe { std::mem::transmute::<ControlFileData, [u8; SIZEOF_CONTROLDATA]>(controlfile) };
|
||||
|
||||
// Recompute the CRC
|
||||
let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
|
||||
data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
let newcrc = crc32c::crc32c(&data_without_crc);
|
||||
|
||||
let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize);
|
||||
|
||||
buf.extend_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
buf.extend_from_slice(&newcrc.to_ne_bytes());
|
||||
// Fill the rest of the control file with zeros.
|
||||
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
|
||||
|
||||
buf.into()
|
||||
}
|
||||
|
||||
pub fn encode_checkpoint(checkpoint: CheckPoint) -> Bytes {
|
||||
let b: [u8; SIZEOF_CHECKPOINT];
|
||||
b = unsafe { std::mem::transmute::<CheckPoint, [u8; SIZEOF_CHECKPOINT]>(checkpoint) };
|
||||
return Bytes::copy_from_slice(&b[..]);
|
||||
}
|
||||
|
||||
pub fn decode_checkpoint(mut buf: Bytes) -> Result<CheckPoint, anyhow::Error> {
|
||||
let mut b = [0u8; SIZEOF_CHECKPOINT];
|
||||
buf.copy_to_slice(&mut b);
|
||||
let checkpoint: CheckPoint;
|
||||
checkpoint = unsafe { std::mem::transmute::<[u8; SIZEOF_CHECKPOINT], CheckPoint>(b) };
|
||||
Ok(checkpoint)
|
||||
}
|
||||
|
||||
impl CheckPoint {
|
||||
pub fn new(lsn: u64, timeline: u32) -> CheckPoint {
|
||||
CheckPoint {
|
||||
redo: lsn,
|
||||
ThisTimeLineID: timeline,
|
||||
PrevTimeLineID: timeline,
|
||||
fullPageWrites: true, // TODO: get actual value of full_page_writes
|
||||
nextXid: FullTransactionId {
|
||||
value: pg_constants::FIRST_NORMAL_TRANSACTION_ID as u64,
|
||||
}, // TODO: handle epoch?
|
||||
nextOid: pg_constants::FIRST_BOOTSTRAP_OBJECT_ID,
|
||||
nextMulti: 1,
|
||||
nextMultiOffset: 0,
|
||||
oldestXid: pg_constants::FIRST_NORMAL_TRANSACTION_ID,
|
||||
oldestXidDB: 0,
|
||||
oldestMulti: 1,
|
||||
oldestMultiDB: 0,
|
||||
time: 0,
|
||||
oldestCommitTsXid: 0,
|
||||
newestCommitTsXid: 0,
|
||||
oldestActiveXid: pg_constants::INVALID_TRANSACTION_ID,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,5 +28,5 @@ pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
|
||||
let bshift: u8 =
|
||||
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
return ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8;
|
||||
((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
|
||||
}
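
// Test sketch (not part of this patch): round-trip one xid's status through the
// CLOG helpers in this module, on a zeroed 8 KB page. It assumes
// transaction_id_set_status() lives in the same module as the getter above.
#[cfg(test)]
mod clog_status_tests {
    use super::*;
    use bytes::BytesMut;

    #[test]
    fn set_then_get_transaction_status() {
        let mut page = BytesMut::from(&[0u8; 8192][..]);
        let xid: u32 = 1234;
        transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, &mut page);
        assert_eq!(
            transaction_id_get_status(xid, &page),
            pg_constants::TRANSACTION_STATUS_COMMITTED
        );
    }
}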
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
//!
|
||||
//! Misc constants, copied from PostgreSQL headers.
|
||||
//!
|
||||
//! TODO: These probably should be auto-generated using bindgen,
|
||||
//! rather than copied by hand. Although on the other hand, it's nice
|
||||
//! to have them all here in one place, and have the ability to add
|
||||
//! comments on them.
|
||||
//!
|
||||
|
||||
//
|
||||
// From pg_tablespace_d.h
|
||||
@@ -15,18 +20,11 @@ pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
// Special values for non-rel files' tags (Zenith-specific)
|
||||
//Special values for non-rel files' tags
|
||||
pub const PG_CONTROLFILE_FORKNUM: u8 = 42;
|
||||
pub const PG_FILENODEMAP_FORKNUM: u8 = 43;
|
||||
pub const PG_XACT_FORKNUM: u8 = 44;
|
||||
pub const PG_MXACT_OFFSETS_FORKNUM: u8 = 45;
|
||||
pub const PG_MXACT_MEMBERS_FORKNUM: u8 = 46;
|
||||
pub const PG_TWOPHASE_FORKNUM: u8 = 47;
|
||||
pub const PG_CHECKPOINT_FORKNUM: u8 = 48;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
|
||||
pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
|
||||
|
||||
// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
|
||||
// --with-segsize=SEGSIZE, but assume the defaults for now.
|
||||
@@ -65,9 +63,11 @@ pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
|
||||
|
||||
// From srlu.h
|
||||
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
|
||||
pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
pub const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
pub const XLOG_HEAP_OPMASK: u8 = 0x70;
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
@@ -88,8 +88,12 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
pub const XLOG_NEXTOID: u8 = 0x30;
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
pub const DB_SHUTDOWNED: u32 = 1;
|
||||
|
||||
// From multixact.h
|
||||
pub const FIRST_MULTIXACT_ID: u32 = 1;
|
||||
pub const MAX_MULTIXACT_ID: u32 = 0xFFFFFFFF;
|
||||
|
||||
pub const XLOG_MULTIXACT_ZERO_OFF_PAGE: u8 = 0x00;
|
||||
pub const XLOG_MULTIXACT_ZERO_MEM_PAGE: u8 = 0x10;
|
||||
pub const XLOG_MULTIXACT_CREATE_ID: u8 = 0x20;
|
||||
@@ -176,3 +180,8 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
|
||||
|
||||
/* FIXME: pageserver should request wal_seg_size from compute node */
|
||||
pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
|
||||
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
|
||||
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
|
||||
pub const XLP_LONG_HEADER: u16 = 0x0002;
|
||||
|
||||
@@ -38,16 +38,6 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
pg_constants::FSM_FORKNUM => Some("fsm"),
|
||||
pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"),
|
||||
pg_constants::INIT_FORKNUM => Some("init"),
|
||||
|
||||
// These should not appear in WAL records, but we use them internally,
|
||||
// and need to be prepared to print them out in log messages and such
|
||||
pg_constants::PG_CONTROLFILE_FORKNUM => Some("controlfile"),
|
||||
pg_constants::PG_FILENODEMAP_FORKNUM => Some("filenodemap"),
|
||||
pg_constants::PG_XACT_FORKNUM => Some("xact"),
|
||||
pg_constants::PG_MXACT_OFFSETS_FORKNUM => Some("mxact_offsets"),
|
||||
pg_constants::PG_MXACT_MEMBERS_FORKNUM => Some("mxact_members"),
|
||||
pg_constants::PG_TWOPHASE_FORKNUM => Some("twophase"),
|
||||
|
||||
_ => Some("UNKNOWN FORKNUM"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,8 +8,17 @@
|
||||
//
|
||||
|
||||
use crate::pg_constants;
|
||||
use crate::CheckPoint;
|
||||
use crate::ControlFileData;
|
||||
use crate::FullTransactionId;
|
||||
use crate::XLogLongPageHeaderData;
|
||||
use crate::XLogPageHeaderData;
|
||||
use crate::XLogRecord;
|
||||
use crate::XLOG_PAGE_MAGIC;
|
||||
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, Bytes};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
@@ -21,19 +30,22 @@ use std::time::SystemTime;
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLOG_PAGE_MAGIC: u16 = 0xD109;
|
||||
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
|
||||
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
|
||||
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
|
||||
pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
|
||||
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = u64;
|
||||
pub type TimestampTz = i64;
|
||||
pub type XLogSegNo = u64;
|
||||
|
||||
const XID_CHECKPOINT_INTERVAL: u32 = 1024;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
|
||||
@@ -83,9 +95,9 @@ pub fn get_current_timestamp() -> TimestampTz {
|
||||
const USECS_PER_SEC: u64 = 1000000;
|
||||
match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
|
||||
Ok(n) => {
|
||||
(n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
|
||||
((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
|
||||
* USECS_PER_SEC
|
||||
+ n.subsec_micros() as u64
|
||||
+ n.subsec_micros() as u64) as i64
|
||||
}
|
||||
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
|
||||
}
|
||||
@@ -120,7 +132,7 @@ fn find_end_of_wal_segment(
|
||||
let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
|
||||
let xlp_info = LittleEndian::read_u16(&buf[2..4]);
|
||||
let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
|
||||
if xlp_magic != XLOG_PAGE_MAGIC {
|
||||
if xlp_magic != XLOG_PAGE_MAGIC as u16 {
|
||||
info!("Invalid WAL file {}.partial magic {}", file_name, xlp_magic);
|
||||
break;
|
||||
}
|
||||
@@ -199,33 +211,31 @@ pub fn find_end_of_wal(
|
||||
let mut high_tli: TimeLineID = 0;
|
||||
let mut high_ispartial = false;
|
||||
|
||||
for entry in fs::read_dir(data_dir).unwrap() {
|
||||
if let Ok(entry) = entry {
|
||||
let ispartial: bool;
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
/*
|
||||
* Check if the filename looks like an xlog file, or a .partial file.
|
||||
*/
|
||||
if IsXLogFileName(fname) {
|
||||
ispartial = false;
|
||||
} else if IsPartialXLogFileName(fname) {
|
||||
ispartial = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
|
||||
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
|
||||
continue;
|
||||
}
|
||||
if segno > high_segno
|
||||
|| (segno == high_segno && tli > high_tli)
|
||||
|| (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
|
||||
{
|
||||
high_segno = segno;
|
||||
high_tli = tli;
|
||||
high_ispartial = ispartial;
|
||||
}
|
||||
for entry in fs::read_dir(data_dir).unwrap().flatten() {
|
||||
let ispartial: bool;
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
/*
|
||||
* Check if the filename looks like an xlog file, or a .partial file.
|
||||
*/
|
||||
if IsXLogFileName(fname) {
|
||||
ispartial = false;
|
||||
} else if IsPartialXLogFileName(fname) {
|
||||
ispartial = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
|
||||
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
|
||||
continue;
|
||||
}
|
||||
if segno > high_segno
|
||||
|| (segno == high_segno && tli > high_tli)
|
||||
|| (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
|
||||
{
|
||||
high_segno = segno;
|
||||
high_tli = tli;
|
||||
high_ispartial = ispartial;
|
||||
}
|
||||
}
|
||||
if high_segno > 0 {
|
||||
@@ -259,21 +269,6 @@ pub fn main() {
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// Xlog record parsing routines
|
||||
// TODO move here other related code from waldecoder.rs
|
||||
//
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogRecord {
|
||||
pub xl_tot_len: u32,
|
||||
pub xl_xid: u32,
|
||||
pub xl_prev: u64,
|
||||
pub xl_info: u8,
|
||||
pub xl_rmid: u8,
|
||||
pub xl_crc: u32,
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
|
||||
XLogRecord {
|
||||
@@ -289,8 +284,170 @@ impl XLogRecord {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Bytes {
|
||||
let b: [u8; XLOG_SIZE_OF_XLOG_RECORD];
|
||||
b = unsafe { std::mem::transmute::<XLogRecord, [u8; XLOG_SIZE_OF_XLOG_RECORD]>(*self) };
|
||||
Bytes::copy_from_slice(&b[..])
|
||||
}
|
||||
|
||||
// Is this record an XLOG_SWITCH record? They need some special processing,
|
||||
pub fn is_xlog_switch_record(&self) -> bool {
|
||||
self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
}
|
||||
}
|
||||
|
||||
impl XLogPageHeaderData {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogPageHeaderData {
|
||||
let hdr: XLogPageHeaderData = XLogPageHeaderData {
|
||||
xlp_magic: buf.get_u16_le(),
|
||||
xlp_info: buf.get_u16_le(),
|
||||
xlp_tli: buf.get_u32_le(),
|
||||
xlp_pageaddr: buf.get_u64_le(),
|
||||
xlp_rem_len: buf.get_u32_le(),
|
||||
};
|
||||
buf.get_u32_le(); //padding
|
||||
hdr
|
||||
}
|
||||
}
|
||||
|
||||
impl XLogLongPageHeaderData {
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogLongPageHeaderData {
|
||||
XLogLongPageHeaderData {
|
||||
std: XLogPageHeaderData::from_bytes(buf),
|
||||
xlp_sysid: buf.get_u64_le(),
|
||||
xlp_seg_size: buf.get_u32_le(),
|
||||
xlp_xlog_blcksz: buf.get_u32_le(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Bytes {
|
||||
let b: [u8; XLOG_SIZE_OF_XLOG_LONG_PHD];
|
||||
b = unsafe {
|
||||
std::mem::transmute::<XLogLongPageHeaderData, [u8; XLOG_SIZE_OF_XLOG_LONG_PHD]>(*self)
|
||||
};
|
||||
Bytes::copy_from_slice(&b[..])
|
||||
}
|
||||
}
|
||||
|
||||
pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
|
||||
|
||||
impl CheckPoint {
|
||||
pub fn new(lsn: u64, timeline: u32) -> CheckPoint {
|
||||
CheckPoint {
|
||||
redo: lsn,
|
||||
ThisTimeLineID: timeline,
|
||||
PrevTimeLineID: timeline,
|
||||
fullPageWrites: true, // TODO: get actual value of full_page_writes
|
||||
nextXid: FullTransactionId {
|
||||
value: pg_constants::FIRST_NORMAL_TRANSACTION_ID as u64,
|
||||
}, // TODO: handle epoch?
|
||||
nextOid: pg_constants::FIRST_BOOTSTRAP_OBJECT_ID,
|
||||
nextMulti: 1,
|
||||
nextMultiOffset: 0,
|
||||
oldestXid: pg_constants::FIRST_NORMAL_TRANSACTION_ID,
|
||||
oldestXidDB: 0,
|
||||
oldestMulti: 1,
|
||||
oldestMultiDB: 0,
|
||||
time: 0,
|
||||
oldestCommitTsXid: 0,
|
||||
newestCommitTsXid: 0,
|
||||
oldestActiveXid: pg_constants::INVALID_TRANSACTION_ID,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> Bytes {
|
||||
let b: [u8; SIZEOF_CHECKPOINT];
|
||||
b = unsafe { std::mem::transmute::<CheckPoint, [u8; SIZEOF_CHECKPOINT]>(*self) };
|
||||
Bytes::copy_from_slice(&b[..])
|
||||
}
|
||||
|
||||
pub fn decode(buf: &[u8]) -> Result<CheckPoint, anyhow::Error> {
|
||||
let mut b = [0u8; SIZEOF_CHECKPOINT];
|
||||
b.copy_from_slice(&buf[0..SIZEOF_CHECKPOINT]);
|
||||
let checkpoint: CheckPoint;
|
||||
checkpoint = unsafe { std::mem::transmute::<[u8; SIZEOF_CHECKPOINT], CheckPoint>(b) };
|
||||
Ok(checkpoint)
|
||||
}
|
||||
|
||||
// Update next XID based on provided new_xid and stored epoch.
|
||||
// Next XID should be greater than new_xid.
|
||||
// Also take in account 32-bit wrap-around.
|
||||
pub fn update_next_xid(&mut self, xid: u32) {
|
||||
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
|
||||
let full_xid = self.nextXid.value;
|
||||
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
let old_xid = full_xid as u32;
|
||||
if new_xid.wrapping_sub(old_xid) as i32 > 0 {
|
||||
let mut epoch = full_xid >> 32;
|
||||
if new_xid < old_xid {
|
||||
// wrap-around
|
||||
epoch += 1;
|
||||
}
|
||||
self.nextXid = FullTransactionId {
|
||||
value: (epoch << 32) | new_xid as u64,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
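
The rounding logic in `update_next_xid` is easy to misread, so here is a small illustrative sketch (not part of the patch). It assumes `XID_CHECKPOINT_INTERVAL` stays at 1024 and `FIRST_NORMAL_TRANSACTION_ID` is 3, matching the constants used above.

```rust
// Illustrative sketch only: shows how update_next_xid rounds up to the next
// XID_CHECKPOINT_INTERVAL boundary and never moves nextXid backwards.
#[cfg(test)]
mod update_next_xid_examples {
    use super::*;

    #[test]
    fn rounds_up_to_interval_boundary() {
        let mut cp = CheckPoint::new(0, 1);

        // 100 is rounded up to 1024, then +1 => nextXid becomes 1025.
        cp.update_next_xid(100);
        assert_eq!(cp.nextXid.value, 1025);

        // A smaller value does not move nextXid backwards.
        cp.update_next_xid(50);
        assert_eq!(cp.nextXid.value, 1025);

        // 1025 is rounded up to 2048, then +1 => nextXid becomes 2049.
        cp.update_next_xid(1025);
        assert_eq!(cp.nextXid.value, 2049);
    }
}
```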
|
||||
|
||||
//
|
||||
// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
|
||||
// We need this segment to start the compute node.
// In order to minimize changes in Postgres core, we prefer to
// provide a WAL segment from which it can extract the checkpoint record in the standard way,
// rather than implement some alternative mechanism.
|
||||
//
|
||||
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
|
||||
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
|
||||
|
||||
let hdr = XLogLongPageHeaderData {
|
||||
std: {
|
||||
XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER,
|
||||
xlp_tli: 1, // FIXME: always use Postgres timeline 1
|
||||
xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64,
|
||||
xlp_rem_len: 0,
|
||||
}
|
||||
},
|
||||
xlp_sysid: pg_control.system_identifier,
|
||||
xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
};
|
||||
|
||||
let hdr_bytes = hdr.encode();
|
||||
seg_buf.extend_from_slice(&hdr_bytes);
|
||||
|
||||
let rec_hdr = XLogRecord {
|
||||
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD
|
||||
+ SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT
|
||||
+ SIZEOF_CHECKPOINT) as u32,
|
||||
xl_xid: 0, //0 is for InvalidTransactionId
|
||||
xl_prev: 0,
|
||||
xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN,
|
||||
xl_rmid: pg_constants::RM_XLOG_ID,
|
||||
xl_crc: 0,
|
||||
};
|
||||
|
||||
let mut rec_shord_hdr_bytes = BytesMut::new();
|
||||
rec_shord_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT);
|
||||
rec_shord_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8);
|
||||
|
||||
let rec_bytes = rec_hdr.encode();
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
|
||||
//calculate record checksum
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &rec_shord_hdr_bytes[..]);
|
||||
crc = crc32c_append(crc, &checkpoint_bytes[..]);
|
||||
crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
|
||||
seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
seg_buf.put_u32_le(crc);
|
||||
seg_buf.extend_from_slice(&rec_shord_hdr_bytes);
|
||||
seg_buf.extend_from_slice(&checkpoint_bytes);
|
||||
|
||||
//zero out the rest of the file
|
||||
seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
|
||||
seg_buf.freeze()
|
||||
}
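
For context, a minimal usage sketch (an assumption, not part of the patch): `generate_wal_segment` returns a complete segment, so a caller only has to write it out under a WAL file name. The directory handling and the hard-coded segment name below are illustrative.

```rust
// Hypothetical caller, shown for illustration only: write the generated bootstrap
// segment into a WAL directory so the compute node can read the shutdown checkpoint.
use std::{fs, io, path::Path};

fn write_bootstrap_segment(pg_control: &ControlFileData, wal_dir: &Path) -> io::Result<()> {
    let seg = generate_wal_segment(pg_control);
    // A real caller would derive the file name from the checkpoint LSN and timeline;
    // the first segment on timeline 1 is assumed here purely as an example.
    fs::write(wal_dir.join("000000010000000000000001"), &seg[..])
}
```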
|
||||
|
||||
proxy/Cargo.toml (new file, 21 lines)
@@ -0,0 +1,21 @@
[package]
name = "proxy"
version = "0.1.0"
authors = ["Stas Kelvich <stas.kelvich@gmail.com>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0"
bytes = { version = "1.0.1", features = ['serde'] }
md5 = "0.7.0"
rand = "0.8.3"
hex = "0.4.3"
serde = "1"
serde_json = "1"
tokio = { version = "1.7.1", features = ["full"] }
tokio-postgres = "0.7.2"
clap = "2.33.0"

zenith_utils = { path = "../zenith_utils" }
proxy/src/cplane_api.rs (new file, 92 lines)
@@ -0,0 +1,92 @@
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{IpAddr, SocketAddr},
|
||||
};
|
||||
|
||||
pub struct CPlaneApi {
|
||||
// address: SocketAddr,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: IpAddr, // TODO: allow host name here too
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
pub password: String,
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> SocketAddr {
|
||||
SocketAddr::new(self.host, self.port)
|
||||
}
|
||||
|
||||
pub fn conn_string(&self) -> String {
|
||||
format!(
|
||||
"dbname={} user={} password={}",
|
||||
self.dbname, self.user, self.password
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// mock cplane api
|
||||
impl CPlaneApi {
|
||||
pub fn new(_address: &SocketAddr) -> CPlaneApi {
|
||||
CPlaneApi {
|
||||
// address: address.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_auth(&self, user: &str, md5_response: &[u8], salt: &[u8; 4]) -> Result<()> {
|
||||
// passwords for both is "mypass"
|
||||
let auth_map: HashMap<_, &str> = vec![
|
||||
("stas@zenith", "716ee6e1c4a9364d66285452c47402b1"),
|
||||
("stas2@zenith", "3996f75df64c16a8bfaf01301b61d582"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let stored_hash = auth_map
|
||||
.get(&user)
|
||||
.ok_or_else(|| anyhow::Error::msg("user not found"))?;
|
||||
let salted_stored_hash = format!(
|
||||
"md5{:x}",
|
||||
md5::compute([stored_hash.as_bytes(), salt].concat())
|
||||
);
|
||||
|
||||
let received_hash = std::str::from_utf8(&md5_response)?;
|
||||
|
||||
println!(
|
||||
"auth: {} rh={} sh={} ssh={} {:?}",
|
||||
user, received_hash, stored_hash, salted_stored_hash, salt
|
||||
);
|
||||
|
||||
if received_hash == salted_stored_hash {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("Auth failed")
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_database_uri(&self, _user: &str, _database: &str) -> Result<DatabaseInfo> {
|
||||
Ok(DatabaseInfo {
|
||||
host: "127.0.0.1".parse()?,
|
||||
port: 5432,
|
||||
dbname: "stas".to_string(),
|
||||
user: "stas".to_string(),
|
||||
password: "mypass".to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
// pub fn create_database(&self, _user: &String, _database: &String) -> Result<DatabaseInfo> {
|
||||
// Ok(DatabaseInfo {
|
||||
// host: "127.0.0.1".parse()?,
|
||||
// port: 5432,
|
||||
// dbname: "stas".to_string(),
|
||||
// user: "stas".to_string(),
|
||||
// password: "mypass".to_string(),
|
||||
// })
|
||||
// }
|
||||
}
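
As a hedged illustration of where stored hashes like the ones in the mock `auth_map` come from: Postgres md5 auth keeps `md5(password || username)` on the server side, and the client answers with `"md5" + md5(stored_hash_hex || salt)`, which is what `check_auth` recomputes. The helper below is an assumption for illustration only, not part of the patch.

```rust
// Illustrative assumption, not part of the patch: how a stored hash like the
// entries in auth_map can be produced (md5 of password concatenated with username).
fn md5_stored_hash(user: &str, password: &str) -> String {
    format!("{:x}", md5::compute([password.as_bytes(), user.as_bytes()].concat()))
}

// If the mock comment above ("passwords for both is mypass") is accurate,
// md5_stored_hash("stas@zenith", "mypass") should reproduce the first stored value.
```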
|
||||
proxy/src/main.rs (new file, 106 lines)
@@ -0,0 +1,106 @@
|
||||
///
|
||||
/// Postgres protocol proxy/router.
|
||||
///
|
||||
/// This service listens on the psql port, can check auth via an external service
/// (the control plane API in our case), and can create new databases and accounts
/// in a somewhat transparent manner (again via communication with the control plane API).
|
||||
///
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{SocketAddr, TcpListener},
|
||||
sync::{mpsc, Mutex},
|
||||
thread,
|
||||
};
|
||||
|
||||
use clap::{App, Arg};
|
||||
|
||||
use cplane_api::DatabaseInfo;
|
||||
|
||||
mod cplane_api;
|
||||
mod mgmt;
|
||||
mod proxy;
|
||||
|
||||
pub struct ProxyConf {
|
||||
/// main entrypoint for users to connect to
|
||||
pub proxy_address: SocketAddr,
|
||||
|
||||
/// HTTP management endpoint. Upon user account creation, the control plane
/// will notify us here, so that we can 'unfreeze' the user session.
|
||||
pub mgmt_address: SocketAddr,
|
||||
|
||||
/// send unauthenticated users to this URI
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub cplane_address: SocketAddr,
|
||||
}
|
||||
|
||||
pub struct ProxyState {
|
||||
pub conf: ProxyConf,
|
||||
pub waiters: Mutex<HashMap<String, mpsc::Sender<anyhow::Result<DatabaseInfo>>>>,
|
||||
}
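
To make the role of `waiters` concrete, here is an illustrative sketch (assumed, not part of the patch) of the handshake it enables between the proxy and mgmt threads; the real flow lives in `proxy.rs` (`handle_new_user`) and `mgmt.rs`.

```rust
// Sketch only: the proxy side registers a channel under a random session id and
// blocks until the mgmt thread delivers a DatabaseInfo for that id.
fn wait_for_db_info(state: &ProxyState, session_id: String) -> anyhow::Result<DatabaseInfo> {
    let (tx, rx) = mpsc::channel();
    state.waiters.lock().unwrap().insert(session_id, tx);
    // mgmt.rs finds this sender by session id and calls send(Ok(db_info))
    // once the control plane reports that the database is ready.
    rx.recv()?
}
```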
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("Zenith proxy/router")
|
||||
.arg(
|
||||
Arg::with_name("proxy")
|
||||
.short("p")
|
||||
.long("proxy")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming client connections on ip:port")
|
||||
.default_value("127.0.0.1:4432"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("mgmt")
|
||||
.short("m")
|
||||
.long("mgmt")
|
||||
.takes_value(true)
|
||||
.help("listen for management callback connection on ip:port")
|
||||
.default_value("127.0.0.1:7000"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("uri")
|
||||
.short("u")
|
||||
.long("uri")
|
||||
.takes_value(true)
|
||||
.help("redirect unauthenticated users to given uri")
|
||||
.default_value("http://localhost:3000/psql_session/"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let conf = ProxyConf {
|
||||
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
|
||||
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
|
||||
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
|
||||
cplane_address: "127.0.0.1:3000".parse()?,
|
||||
};
|
||||
let state = ProxyState {
|
||||
conf,
|
||||
waiters: Mutex::new(HashMap::new()),
|
||||
};
|
||||
let state: &'static ProxyState = Box::leak(Box::new(state));
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
println!("Starting proxy on {}", state.conf.proxy_address);
|
||||
let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
|
||||
|
||||
println!("Starting mgmt on {}", state.conf.mgmt_address);
|
||||
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
|
||||
|
||||
let threads = vec![
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
thread::Builder::new()
|
||||
.name("Proxy thread".into())
|
||||
.spawn(move || proxy::thread_main(&state, pageserver_listener))?,
|
||||
thread::Builder::new()
|
||||
.name("Mgmt thread".into())
|
||||
.spawn(move || mgmt::thread_main(&state, mgmt_listener))?,
|
||||
];
|
||||
|
||||
for t in threads.into_iter() {
|
||||
t.join().unwrap()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
proxy/src/mgmt.rs (new file, 111 lines)
@@ -0,0 +1,111 @@
|
||||
use std::{
|
||||
net::{TcpListener, TcpStream},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::{
|
||||
postgres_backend::{self, query_from_cstring, PostgresBackend},
|
||||
pq_proto::{BeMessage, SINGLE_COL_ROWDESC},
|
||||
};
|
||||
|
||||
use crate::{cplane_api::DatabaseInfo, ProxyState};
|
||||
|
||||
///
|
||||
/// Main proxy listener loop.
|
||||
///
|
||||
/// Listens for connections, and launches a new handler thread for each.
|
||||
///
|
||||
pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept()?;
|
||||
println!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = mgmt_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let mut conn_handler = MgmtHandler { state };
|
||||
let mut pgbackend = PostgresBackend::new(socket, postgres_backend::AuthType::Trust)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
struct MgmtHandler {
|
||||
state: &'static ProxyState,
|
||||
}
|
||||
/// Serialized examples:
|
||||
// {
|
||||
// "session_id": "71d6d03e6d93d99a",
|
||||
// "result": {
|
||||
// "Success": {
|
||||
// "host": "127.0.0.1",
|
||||
// "port": 5432,
|
||||
// "dbname": "stas",
|
||||
// "user": "stas"
|
||||
// "password": "mypass"
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// {
|
||||
// "session_id": "71d6d03e6d93d99a",
|
||||
// "result": {
|
||||
// "Failure": "oops"
|
||||
// }
|
||||
// }
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct PsqlSessionResponse {
|
||||
session_id: String,
|
||||
result: PsqlSessionResult,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum PsqlSessionResult {
|
||||
Success(DatabaseInfo),
|
||||
Failure(String),
|
||||
}
|
||||
|
||||
impl postgres_backend::Handler for MgmtHandler {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
|
||||
let waiters = self.state.waiters.lock().unwrap();
|
||||
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
proxy/src/proxy.rs (new file, 256 lines)
@@ -0,0 +1,256 @@
|
||||
use crate::cplane_api::CPlaneApi;
|
||||
use crate::cplane_api::DatabaseInfo;
|
||||
use crate::ProxyState;
|
||||
|
||||
use anyhow::bail;
|
||||
use tokio_postgres::NoTls;
|
||||
|
||||
use rand::Rng;
|
||||
use std::sync::mpsc::channel;
|
||||
use std::thread;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
|
||||
use zenith_utils::pq_proto::*;
|
||||
use zenith_utils::{postgres_backend, pq_proto::BeMessage};
|
||||
|
||||
///
|
||||
/// Main proxy listener loop.
|
||||
///
|
||||
/// Listens for connections, and launches a new handler thread for each.
|
||||
///
|
||||
pub fn thread_main(
|
||||
state: &'static ProxyState,
|
||||
listener: std::net::TcpListener,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept()?;
|
||||
println!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = proxy_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// XXX: clean up fields
|
||||
struct ProxyConnection {
|
||||
state: &'static ProxyState,
|
||||
|
||||
cplane: CPlaneApi,
|
||||
|
||||
user: String,
|
||||
database: String,
|
||||
|
||||
pgb: PostgresBackend,
|
||||
md5_salt: [u8; 4],
|
||||
|
||||
psql_session_id: String,
|
||||
}
|
||||
|
||||
pub fn proxy_conn_main(
|
||||
state: &'static ProxyState,
|
||||
socket: std::net::TcpStream,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut conn = ProxyConnection {
|
||||
state,
|
||||
cplane: CPlaneApi::new(&state.conf.cplane_address),
|
||||
user: "".into(),
|
||||
database: "".into(),
|
||||
pgb: PostgresBackend::new(socket, postgres_backend::AuthType::MD5)?,
|
||||
md5_salt: [0u8; 4],
|
||||
psql_session_id: "".into(),
|
||||
};
|
||||
|
||||
// Check StartupMessage
|
||||
// This will set conn.existing_user and we can decide on next actions
|
||||
conn.handle_startup()?;
|
||||
|
||||
// both scenarios here should end up producing a database connection string
|
||||
let db_info = if conn.is_existing_user() {
|
||||
conn.handle_existing_user()?
|
||||
} else {
|
||||
conn.handle_new_user()?
|
||||
};
|
||||
|
||||
// ok, proxy pass user connection to database_uri
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let _ = runtime.block_on(proxy_pass(conn.pgb, db_info))?;
|
||||
|
||||
println!("proxy_conn_main done;");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl ProxyConnection {
|
||||
fn is_existing_user(&self) -> bool {
|
||||
self.user.ends_with("@zenith")
|
||||
}
|
||||
|
||||
fn handle_startup(&mut self) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
match msg {
|
||||
Some(FeMessage::StartupMessage(m)) => {
|
||||
println!("got startup message {:?}", m);
|
||||
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
|
||||
println!("SSL requested");
|
||||
self.pgb.write_message(&BeMessage::Negotiate)?;
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
self.user = m
|
||||
.params
|
||||
.get("user")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("user is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
self.database = m
|
||||
.params
|
||||
.get("database")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("database is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
|
||||
break;
|
||||
}
|
||||
StartupRequestCode::Cancel => break,
|
||||
}
|
||||
}
|
||||
None => {
|
||||
bail!("connection closed")
|
||||
}
|
||||
unexpected => {
|
||||
bail!("unexpected message type : {:?}", unexpected)
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
// ask password
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
self.pgb
|
||||
.write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
|
||||
self.pgb.state = ProtoState::Authentication; // XXX
|
||||
|
||||
// check password
|
||||
println!("handle_existing_user");
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
if let Some(FeMessage::PasswordMessage(m)) = msg {
|
||||
println!("got password message '{:?}'", m);
|
||||
|
||||
assert!(self.is_existing_user());
|
||||
|
||||
let (_trailing_null, md5_response) = m
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
|
||||
|
||||
if let Err(e) = self.check_auth_md5(md5_response) {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
bail!("auth failed: {}", e);
|
||||
} else {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
}
|
||||
|
||||
// ok, we are authorized
|
||||
self.cplane.get_database_uri(&self.user, &self.database)
|
||||
}
|
||||
|
||||
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
let mut psql_session_id_buf = [0u8; 8];
|
||||
rand::thread_rng().fill(&mut psql_session_id_buf);
|
||||
self.psql_session_id = hex::encode(psql_session_id_buf);
|
||||
|
||||
let hello_message = format!("☀️ Welcome to Zenith!

To proceed with database creation, open the following link:

{}{}

This needs to be done only once. We will then send you a '.pgpass' file that will let you access or create
databases without opening the browser.

", self.state.conf.redirect_uri, self.psql_session_id);
|
||||
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb
|
||||
.write_message(&BeMessage::NoticeResponse(hello_message))?;
|
||||
|
||||
// wait for database creation
|
||||
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
|
||||
let _ = self
|
||||
.state
|
||||
.waiters
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(self.psql_session_id.clone(), tx);
|
||||
|
||||
// Wait for web console response
|
||||
// XXX: respond with error to client
|
||||
let dbinfo = rx.recv()??;
|
||||
|
||||
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
|
||||
"Connecting to database.".to_string(),
|
||||
))?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(dbinfo)
|
||||
}
|
||||
|
||||
fn check_auth_md5(&self, md5_response: &[u8]) -> anyhow::Result<()> {
|
||||
assert!(self.is_existing_user());
|
||||
self.cplane
|
||||
.check_auth(self.user.as_str(), md5_response, &self.md5_salt)
|
||||
}
|
||||
}
|
||||
|
||||
async fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
|
||||
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
|
||||
let _ = config.connect_raw(&mut socket, NoTls).await?;
|
||||
|
||||
println!("Connected to pg, proxying");
|
||||
|
||||
let incoming_std = pgb.into_stream();
|
||||
incoming_std.set_nonblocking(true)?;
|
||||
let mut incoming_conn = tokio::net::TcpStream::from_std(incoming_std)?;
|
||||
|
||||
let (mut ri, mut wi) = incoming_conn.split();
|
||||
let (mut ro, mut wo) = socket.split();
|
||||
|
||||
let client_to_server = async {
|
||||
tokio::io::copy(&mut ri, &mut wo).await?;
|
||||
wo.shutdown().await
|
||||
};
|
||||
|
||||
let server_to_client = async {
|
||||
tokio::io::copy(&mut ro, &mut wi).await?;
|
||||
wi.shutdown().await
|
||||
};
|
||||
|
||||
tokio::try_join!(client_to_server, server_to_client)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
run_clippy.sh (new executable file, 11 lines)
@@ -0,0 +1,11 @@
#!/bin/bash

# If you save this in your path under the name "cargo-zclippy" (or whatever
# name you like), then you can run it as "cargo zclippy" from the shell prompt.
#
# If your text editor has rust-analyzer integration, you can also use this new
# command as a replacement for "cargo check" or "cargo clippy" and see clippy
# warnings and errors right in the editor.
# In vscode, this setting is Rust-analyzer>Check On Save:Command

cargo clippy "${@:2}" -- -A clippy::new_without_default -A clippy::manual_range_contains -A clippy::comparison_chain
test_runner/Pipfile (new file, 18 lines)
@@ -0,0 +1,18 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
pytest = ">=6.0.0"
psycopg2 = "*"
typing-extensions = "*"

[dev-packages]
yapf = "*"
flake8 = "*"
mypy = "*"

[requires]
# we need at least 3.6, but pipenv doesn't allow specifying this directly
python_version = "3"
test_runner/Pipfile.lock (generated file, 269 lines)
@@ -0,0 +1,269 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "4c20c05c20c50cf7e8f78ab461ab23841125345e63e00e2efa7661c165b6b364"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:833b26fb89d5de469b24a390e9df088d4e52e4ba33b01dc5e0e4f41b81a16c00",
|
||||
"sha256:b142cc1dd1342f31ff04bb7d022492b09920cb64fed867cd3ea6f80fe3ebd139"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.5.0"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5",
|
||||
"sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==20.9"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
|
||||
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.13.1"
|
||||
},
|
||||
"psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:03a485bf71498870e38b535c0e6e7162d6ac06a91487edddc3b959894d65f79c",
|
||||
"sha256:22102cfeb904898254f287b1a77360bf66c636858e7476593acd5267e5c24ff9",
|
||||
"sha256:8f4c1800e57ad128d20b2e91d222ca238fffd316cef65be781361cdf35e37979",
|
||||
"sha256:b12073fdf2002e828e5921be2c39ff9c6eab361c5c0bd6c529619fc23677accc",
|
||||
"sha256:b6f47af317af8110818d255e693cfa80b7f1e435285be09778db7b66efd95789",
|
||||
"sha256:d549db98fc0e6db41a2aa0d65f7434c4308a9f64012adb209b9e489f26fe87c6",
|
||||
"sha256:e44e39a46af7c30566b7667fb27e701e652ab0a51e05c263a01d3ff0e223b765",
|
||||
"sha256:e84c80be7a238d3c9c099b71f6890eaa35fc881146232cce888a88ab1bfb431e",
|
||||
"sha256:f3d42bd42302293767b84206d9a446abc67ed4a133e4fe04dad8952de06c2091"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
|
||||
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.10.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
|
||||
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.7"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b",
|
||||
"sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.2.4"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
|
||||
"sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
|
||||
"sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.0"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
|
||||
"sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.4.1"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b",
|
||||
"sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.9.2"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:833b26fb89d5de469b24a390e9df088d4e52e4ba33b01dc5e0e4f41b81a16c00",
|
||||
"sha256:b142cc1dd1342f31ff04bb7d022492b09920cb64fed867cd3ea6f80fe3ebd139"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.5.0"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"mypy": {
|
||||
"hashes": [
|
||||
"sha256:0190fb77e93ce971954c9e54ea61de2802065174e5e990c9d4c1d0f54fbeeca2",
|
||||
"sha256:0756529da2dd4d53d26096b7969ce0a47997123261a5432b48cc6848a2cb0bd4",
|
||||
"sha256:2f9fedc1f186697fda191e634ac1d02f03d4c260212ccb018fabbb6d4b03eee8",
|
||||
"sha256:353aac2ce41ddeaf7599f1c73fed2b75750bef3b44b6ad12985a991bc002a0da",
|
||||
"sha256:3f12705eabdd274b98f676e3e5a89f247ea86dc1af48a2d5a2b080abac4e1243",
|
||||
"sha256:4efc67b9b3e2fddbe395700f91d5b8deb5980bfaaccb77b306310bd0b9e002eb",
|
||||
"sha256:517e7528d1be7e187a5db7f0a3e479747307c1b897d9706b1c662014faba3116",
|
||||
"sha256:68a098c104ae2b75e946b107ef69dd8398d54cb52ad57580dfb9fc78f7f997f0",
|
||||
"sha256:746e0b0101b8efec34902810047f26a8c80e1efbb4fc554956d848c05ef85d76",
|
||||
"sha256:8be7bbd091886bde9fcafed8dd089a766fa76eb223135fe5c9e9798f78023a20",
|
||||
"sha256:9236c21194fde5df1b4d8ebc2ef2c1f2a5dc7f18bcbea54274937cae2e20a01c",
|
||||
"sha256:9ef5355eaaf7a23ab157c21a44c614365238a7bdb3552ec3b80c393697d974e1",
|
||||
"sha256:9f1d74eeb3f58c7bd3f3f92b8f63cb1678466a55e2c4612bf36909105d0724ab",
|
||||
"sha256:a26d0e53e90815c765f91966442775cf03b8a7514a4e960de7b5320208b07269",
|
||||
"sha256:ae94c31bb556ddb2310e4f913b706696ccbd43c62d3331cd3511caef466871d2",
|
||||
"sha256:b5ba1f0d5f9087e03bf5958c28d421a03a4c1ad260bf81556195dffeccd979c4",
|
||||
"sha256:b5dfcd22c6bab08dfeded8d5b44bdcb68c6f1ab261861e35c470b89074f78a70",
|
||||
"sha256:cd01c599cf9f897b6b6c6b5d8b182557fb7d99326bcdf5d449a0fbbb4ccee4b9",
|
||||
"sha256:e89880168c67cf4fde4506b80ee42f1537ad66ad366c101d388b3fd7d7ce2afd",
|
||||
"sha256:ebe2bc9cb638475f5d39068d2dbe8ae1d605bb8d8d3ff281c695df1670ab3987",
|
||||
"sha256:f89bfda7f0f66b789792ab64ce0978e4a991a0e4dd6197349d0767b0f1095b21",
|
||||
"sha256:fc4d63da57ef0e8cd4ab45131f3fe5c286ce7dd7f032650d0fbc239c6190e167",
|
||||
"sha256:fd634bc17b1e2d6ce716f0e43446d0d61cdadb1efcad5c56ca211c22b246ebc8"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.902"
|
||||
},
|
||||
"mypy-extensions": {
|
||||
"hashes": [
|
||||
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
|
||||
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
|
||||
],
|
||||
"version": "==0.4.3"
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068",
|
||||
"sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.7.0"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3",
|
||||
"sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.3.1"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typed-ast": {
|
||||
"hashes": [
|
||||
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
|
||||
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
|
||||
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
|
||||
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
|
||||
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
|
||||
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
|
||||
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
|
||||
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
|
||||
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
|
||||
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
|
||||
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
|
||||
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
|
||||
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
|
||||
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
|
||||
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
|
||||
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
|
||||
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
|
||||
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
|
||||
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
|
||||
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
|
||||
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
|
||||
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
|
||||
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
|
||||
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
|
||||
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
|
||||
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
|
||||
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
|
||||
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
|
||||
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
|
||||
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==1.4.3"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
|
||||
"sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
|
||||
"sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.0"
|
||||
},
|
||||
"yapf": {
|
||||
"hashes": [
|
||||
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
|
||||
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.31.0"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
|
||||
"sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.4.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,13 +4,10 @@ This directory contains integration tests.
|
||||
|
||||
Prerequisites:
|
||||
- Python 3.6 or later
|
||||
- Python packages: pytest, psycopg2
|
||||
- pytest 6.0 is required.
|
||||
- __NOTE: `apt install` on Debian/Ubuntu won't work.__
|
||||
They ship a much older version of pytest (and sometimes rename it to
|
||||
`pytest-3`.)
|
||||
- Install using something like this:
|
||||
- `pip3 install pytest psycopg2` (Debian or Ubuntu)
|
||||
- Dependencies: install them via `pipenv install`. Note that Debian/Ubuntu
  packages are stale, as commonly happens, so manual installation is not
  recommended.
  Run `pipenv shell` to activate the venv.
|
||||
- Zenith and Postgres binaries
|
||||
- See the root README.md for build directions
|
||||
- Tests can be run from the git tree; or see the environment variables
|
||||
@@ -72,7 +69,8 @@ The tests make heavy use of pytest fixtures. You can read about how they work he
|
||||
Essentially, this means that each time you see a fixture named as an input parameter, the function with that name will be run and passed as a parameter to the function.
|
||||
|
||||
So this code:
|
||||
```
|
||||
|
||||
```python
|
||||
def test_something(zenith_cli, pg_bin):
|
||||
pass
|
||||
```
|
||||
@@ -80,9 +78,11 @@ def test_something(zenith_cli, pg_bin):
|
||||
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
|
||||
|
||||
Fixtures can't be imported using the normal python syntax. Instead, use this:
|
||||
```
|
||||
|
||||
```python
|
||||
pytest_plugins = ("fixtures.something")
|
||||
```
|
||||
|
||||
That will make all the fixtures in the `fixtures/something.py` file available.
|
||||
|
||||
Anything that's likely to be used in multiple tests should be built into a fixture.
|
||||
@@ -90,3 +90,15 @@ Anything that's likely to be used in multiple tests should be built into a fixtu
|
||||
Note that fixtures can clean up after themselves if they use the `yield` syntax.
|
||||
Cleanup will happen even if the test fails (raises an unhandled exception).
|
||||
Python destructors, e.g. `__del__()` aren't recommended for cleanup.
|
||||
|
||||
|
||||
### Code quality
|
||||
|
||||
Before submitting a patch, please consider:
|
||||
|
||||
* Writing a couple of docstrings to clarify the reasoning behind a new test.
|
||||
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
|
||||
* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that).
|
||||
* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
|
||||
|
||||
The tools can be installed with `pipenv install --dev`.
|
||||
|
||||
@@ -1,67 +1,73 @@
|
||||
import pytest
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Create a couple of branches off the main branch, at a historical point in time.
|
||||
#
|
||||
def test_branch_behind(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"]);
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"])
|
||||
|
||||
pgmain = postgres.create_start('test_branch_behind')
|
||||
print("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
main_pg_conn = psycopg2.connect(pgmain.connstr());
|
||||
main_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
main_cur.execute('CREATE TABLE foo (t text)');
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
main_cur.execute('CREATE TABLE foo (t text)')
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@"+lsn_a]);
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
|
||||
# Insert many more rows. This generates enough WAL to fill a few segments.
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
|
||||
# Branch at the point where only 200 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@"+lsn_b]);
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
pg_more = postgres.create_start("test_branch_behind_more")
|
||||
|
||||
# On the 'hundred' branch, we should see only 100 rows
|
||||
hundred_pg_conn = psycopg2.connect(pg_hundred.connstr())
|
||||
hundred_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
hundred_pg_conn = pg_hundred.connect()
|
||||
hundred_cur = hundred_pg_conn.cursor()
|
||||
hundred_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(hundred_cur.fetchone()[0] == 100);
|
||||
hundred_cur.execute('SELECT count(*) FROM foo')
|
||||
assert hundred_cur.fetchone() == (100, )
|
||||
|
||||
# On the 'more' branch, we should see 100200 rows
|
||||
more_pg_conn = psycopg2.connect(pg_more.connstr())
|
||||
more_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
more_pg_conn = pg_more.connect()
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(more_cur.fetchone()[0] == 100100);
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (100100, )
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(main_cur.fetchone()[0] == 200100);
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (200100, )
|
||||
|
||||
test_runner/batch_others/test_bulk_insert.py (new file, 25 lines)
@@ -0,0 +1,25 @@
from contextlib import closing
import psycopg2.extras

pytest_plugins = ("fixtures.zenith_fixtures")

#
# Test insertion of a large number of records.
#
# This test is pretty tightly coupled with the current implementation of page version storage
# and garbage collection in object_repository.rs.
#
def test_bulk_insert(zenith_cli, pageserver, postgres, pg_bin):
    zenith_cli.run(["branch", "test_bulk_insert", "empty"])
    pg = postgres.create_start('test_bulk_insert')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("create table t(c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint)")
            cur.execute("create index on t(c1)")
            cur.execute("create index on t(c2)")
            cur.execute("create index on t(c3)")
            cur.execute("create index on t(c4)")
            cur.execute("create index on t(c5)")
            cur.execute("insert into t values (generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000))")
            cur.execute("insert into t values (generate_series(1,1000000),random()*1000000,random()*1000000,random()*1000000,random()*1000000)")
@@ -1,7 +1,4 @@
|
||||
import pytest
|
||||
import os
|
||||
import getpass
|
||||
import psycopg2
|
||||
from contextlib import closing
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -11,20 +8,22 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
def test_config(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_config", "empty"]);
|
||||
zenith_cli.run(["branch", "test_config", "empty"])
|
||||
|
||||
# change config
|
||||
pg = postgres.create_start('test_config', ['log_min_messages=debug1'])
|
||||
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
|
||||
print('postgres is running on test_config branch')
|
||||
|
||||
pg_conn = psycopg2.connect(pg.connstr())
|
||||
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = pg_conn.cursor()
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('''
|
||||
SELECT setting
|
||||
FROM pg_settings
|
||||
WHERE
|
||||
source != 'default'
|
||||
AND source != 'override'
|
||||
AND name = 'log_min_messages'
|
||||
''')
|
||||
|
||||
#check that config change was applied
|
||||
cur.execute('SELECT name, setting from pg_settings WHERE source!=%s and source!=%s', ("default","override",))
|
||||
for record in cur:
|
||||
if record[0] == 'log_min_messages':
|
||||
assert(record[1] == 'debug1')
|
||||
|
||||
pg_conn.close()
|
||||
# check that config change was applied
|
||||
assert cur.fetchone() == ('debug1', )
|
||||
|
||||
@@ -1,37 +1,32 @@
|
||||
import pytest
|
||||
import getpass
|
||||
import psycopg2
|
||||
from contextlib import closing
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test CREATE DATABASE when there have been relmapper changes
|
||||
#
|
||||
def test_createdb(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"]);
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createdb')
|
||||
print("postgres is running on 'test_createdb' branch")
|
||||
|
||||
conn = psycopg2.connect(pg.connstr());
|
||||
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = conn.cursor()
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# Cause a 'relmapper' change in the original branch
|
||||
cur.execute('VACUUM FULL pg_class')
|
||||
|
||||
# Cause a 'relmapper' change in the original branch
|
||||
cur.execute('VACUUM FULL pg_class');
|
||||
cur.execute('CREATE DATABASE foodb')
|
||||
|
||||
cur.execute('CREATE DATABASE foodb');
|
||||
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
conn.close();
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
# Create a branch
|
||||
zenith_cli.run(["branch", "test_createdb2", "test_createdb@"+lsn]);
|
||||
zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
|
||||
|
||||
pg2 = postgres.create_start('test_createdb2')
|
||||
|
||||
# Test that you can connect to the new database on both branches
|
||||
conn = psycopg2.connect(pg.connstr('foodb'));
|
||||
conn2 = psycopg2.connect(pg2.connstr('foodb'));
|
||||
for db in (pg, pg2):
|
||||
db.connect(dbname='foodb').close()
|
||||
|
||||
test_runner/batch_others/test_createuser.py (new file, 31 lines)
@@ -0,0 +1,31 @@
from contextlib import closing

pytest_plugins = ("fixtures.zenith_fixtures")


#
# Test CREATE USER to check shared catalog restore
#
def test_createuser(zenith_cli, pageserver, postgres, pg_bin):
    zenith_cli.run(["branch", "test_createuser", "empty"])

    pg = postgres.create_start('test_createuser')
    print("postgres is running on 'test_createuser' branch")

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            # Create a user; this modifies the shared catalogs
            cur.execute('CREATE USER testuser with password %s', ('testpwd', ))

            cur.execute('CHECKPOINT')

            cur.execute('SELECT pg_current_wal_insert_lsn()')
            lsn = cur.fetchone()[0]

    # Create a branch
    zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])

    pg2 = postgres.create_start('test_createuser2')

    # Test that you can connect to the new branch as the new user
    assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]
test_runner/batch_others/test_gc.py (new file, 97 lines)
@@ -0,0 +1,97 @@
|
||||
from contextlib import closing
|
||||
import psycopg2.extras
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
# Test Garbage Collection of old page versions.
|
||||
#
|
||||
# This test is pretty tightly coupled with the current implementation of page version storage
|
||||
# and garbage collection in object_repository.rs.
|
||||
#
|
||||
def test_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_gc", "empty"])
|
||||
pg = postgres.create_start('test_gc')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
with closing(pageserver.connect()) as psconn:
|
||||
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
# Create a test table
|
||||
cur.execute("CREATE TABLE foo(x integer)")
|
||||
|
||||
# Run GC, to clear out any old page versions left behind in the catalogs by
|
||||
# the CREATE TABLE command. We want to have a clean slate with no garbage
|
||||
# before running the actual tests below, otherwise the counts won't match
|
||||
# what we expect.
|
||||
print("Running GC before test")
|
||||
pscur.execute(f"do_gc {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
# remember the number of relations
|
||||
n_relations = row['n_relations']
|
||||
assert n_relations > 0
|
||||
|
||||
# Insert a row. The first insert will also create a metadata entry for the
|
||||
# relation, with size == 1 block. Hence, bump up the expected relation count.
|
||||
n_relations += 1;
|
||||
print("Inserting one row and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (1)")
|
||||
pscur.execute(f"do_gc {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 30
|
||||
assert row['deleted'] == 3
|
||||
|
||||
# Insert two more rows and run GC.
|
||||
print("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 30
|
||||
assert row['deleted'] == 2
|
||||
|
||||
# Insert one more row. It creates one more page version, but doesn't affect the
|
||||
# relation size.
|
||||
print("Inserting one more row")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 30
|
||||
assert row['deleted'] == 1
|
||||
|
||||
# Run GC again, with no changes in the database. Should not remove anything.
|
||||
pscur.execute(f"do_gc {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 30
|
||||
assert row['deleted'] == 0
|
||||
|
||||
#
|
||||
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
|
||||
#
|
||||
cur.execute("DROP TABLE foo")
|
||||
|
||||
pscur.execute(f"do_gc {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
assert row['dropped'] == 3
|
||||
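The do_gc pageserver command used above returns a single row whose columns (elapsed, n_relations, dropped, truncated, deleted) the test prints with format_map. A small helper that wraps that call could look like the following sketch; it assumes the pageserver fixture and the DictCursor setup shown in test_gc, and the helper name is made up for illustration.

import psycopg2.extras
from contextlib import closing

def run_gc(pageserver, timeline, horizon=0):
    # Run one GC pass on the given timeline and return the stats row as a dict.
    with closing(pageserver.connect()) as psconn:
        with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
            pscur.execute(f"do_gc {timeline} {horizon}")
            row = pscur.fetchone()
            print("GC duration {elapsed} ms, relations: {n_relations}, "
                  "dropped {dropped}, truncated: {truncated}, deleted: {deleted}"
                  .format_map(row))
            return dict(row)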
test_runner/batch_others/test_multixact.py (new file, 63 lines)
@@ -0,0 +1,63 @@
pytest_plugins = ("fixtures.zenith_fixtures")


#
# Test multixact state after branching
# Now this test is very minimalistic -
# it only checks next_multixact_id field in restored pg_control,
# since we don't have functions to check multixact internals.
#
def test_multixact(pageserver, postgres, pg_bin, zenith_cli, base_dir):
    # Create a branch for us
    zenith_cli.run(["branch", "test_multixact", "empty"])
    pg = postgres.create_start('test_multixact')

    print("postgres is running on 'test_multixact' branch")
    pg_conn = pg.connect()
    cur = pg_conn.cursor()

    cur.execute('''
        CREATE TABLE t1(i int primary key);
        INSERT INTO t1 select * from generate_series(1, 100);
    ''')

    cur.execute('SELECT next_multixact_id FROM pg_control_checkpoint()')
    next_multixact_id_old = cur.fetchone()[0]

    # Lock entries in parallel connections to set multixact
    nclients = 3
    connections = []
    for i in range(nclients):
        # Do not turn on autocommit. We want to hold the key-share locks.
        conn = pg.connect(autocommit=False)
        conn.cursor().execute('select * from t1 for key share')
        connections.append(conn)

    # We should have a multixact now. We can close the connections.
    for c in connections:
        c.close()

    # force wal flush
    cur.execute('checkpoint')

    cur.execute('SELECT next_multixact_id, pg_current_wal_flush_lsn() FROM pg_control_checkpoint()')
    res = cur.fetchone()
    next_multixact_id = res[0]
    lsn = res[1]

    # Ensure that we did lock some tuples
    assert int(next_multixact_id) > int(next_multixact_id_old)

    # Branch at this point
    zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
    pg_new = postgres.create_start('test_multixact_new')

    print("postgres is running on 'test_multixact_new' branch")
    pg_new_conn = pg_new.connect()
    cur_new = pg_new_conn.cursor()

    cur_new.execute('SELECT next_multixact_id FROM pg_control_checkpoint()')
    next_multixact_id_new = cur_new.fetchone()[0]

    # Check that we restored pg_controlfile correctly
    assert next_multixact_id_new == next_multixact_id
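A multixact is created when several sessions hold key-share locks on the same rows at once, which is what the parallel connections above are for. A standalone sketch of that locking pattern with plain psycopg2, kept outside the fixture framework; the DSN is a placeholder and the table name t1 is taken from the test.

import psycopg2

def create_multixact(dsn, nclients=3):
    # Open several sessions and key-share lock the same rows; because autocommit
    # stays off, each session keeps its lock until the connection is closed.
    conns = [psycopg2.connect(dsn) for _ in range(nclients)]
    try:
        for conn in conns:
            cur = conn.cursor()
            cur.execute('SELECT * FROM t1 FOR KEY SHARE')
    finally:
        for conn in conns:
            conn.close()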
@@ -1,29 +1,23 @@
import pytest
import psycopg2
import getpass
import json

pytest_plugins = ("fixtures.zenith_fixtures")


def test_status(pageserver):
pg_conn = psycopg2.connect(pageserver.connstr())
pg_conn.autocommit = True
cur = pg_conn.cursor()
cur.execute('status;')
assert cur.fetchone() == ('hello world',)
pg_conn.close()
assert pageserver.safe_psql('status') == [
('hello world', ),
]


def test_branch_list(pageserver, zenith_cli):

# Create a branch for us
zenith_cli.run(["branch", "test_branch_list_main", "empty"]);
zenith_cli.run(["branch", "test_branch_list_main", "empty"])

page_server_conn = psycopg2.connect(pageserver.connstr())
page_server_conn.autocommit = True
page_server_cur = page_server_conn.cursor()
conn = pageserver.connect()
cur = conn.cursor()

page_server_cur.execute('branch_list;')
branches = json.loads(page_server_cur.fetchone()[0])
cur.execute('branch_list')
branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
branches = [x for x in branches if x['name'].startswith('test_branch_list')]

@@ -38,8 +32,8 @@ def test_branch_list(pageserver, zenith_cli):
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])

page_server_cur.execute('branch_list;')
new_branches = json.loads(page_server_cur.fetchone()[0])
cur.execute('branch_list')
new_branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
assert len(new_branches) == 2
@@ -51,4 +45,4 @@ def test_branch_list(pageserver, zenith_cli):
# TODO: do the LSNs have to match here?
assert new_branches[1] == branches[0]

page_server_conn.close()
conn.close()
@@ -1,17 +1,15 @@
import pytest

pytest_plugins = ("fixtures.zenith_fixtures")


def test_pgbench(pageserver, postgres, pg_bin, zenith_cli):

# Create a branch for us
zenith_cli.run(["branch", "test_pgbench", "empty"]);
zenith_cli.run(["branch", "test_pgbench", "empty"])

pg = postgres.create_start('test_pgbench')
print("postgres is running on 'test_pgbench' branch")

connstr = pg.connstr();
connstr = pg.connstr()

pg_bin.run_capture(['pgbench', '-i', connstr])
pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr])
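pg_bin.run_capture simply runs the pgbench binary against the node's libpq connection string, with output redirected to files. A rough equivalent using plain subprocess, for running the same workload outside the fixture framework; the connection string value is a placeholder.

import subprocess

def run_pgbench(connstr):
    # Initialize pgbench tables, then run the same short prepared-statement workload.
    subprocess.run(['pgbench', '-i', connstr], check=True)
    subprocess.run(['pgbench', '-c', '10', '-T', '5', '-P', '1', '-M', 'prepared', connstr],
                   check=True)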
test_runner/batch_others/test_restart_compute.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from contextlib import closing

pytest_plugins = ("fixtures.zenith_fixtures")


#
# Test restarting and recreating a postgres instance
#
def test_restart_compute(zenith_cli, pageserver, postgres, pg_bin):
    zenith_cli.run(["branch", "test_restart_compute", "empty"])

    pg = postgres.create_start('test_restart_compute')
    print("postgres is running on 'test_restart_compute' branch")

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            # Create table, and insert a row
            cur.execute('CREATE TABLE foo (t text)')
            cur.execute("INSERT INTO foo VALUES ('bar')")

    # Stop and restart the Postgres instance
    pg.stop_and_destroy().create_start('test_restart_compute')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            # We can still see the row
            cur.execute('SELECT count(*) FROM foo')
            assert cur.fetchone() == (1, )

            # Insert another row
            cur.execute("INSERT INTO foo VALUES ('bar2')")
            cur.execute('SELECT count(*) FROM foo')
            assert cur.fetchone() == (2, )

    # Stop, and destroy the Postgres instance. Then recreate and restart it.
    pg.stop_and_destroy().create_start('test_restart_compute')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            # We can still see the rows
            cur.execute('SELECT count(*) FROM foo')
            assert cur.fetchone() == (2, )
test_runner/batch_others/test_seq_scan.py (new file, 26 lines)
@@ -0,0 +1,26 @@
from contextlib import closing
import psycopg2.extras
import time

pytest_plugins = ("fixtures.zenith_fixtures")

#
# Test insertion of a large number of records
#
# This test is pretty tightly coupled with the current implementation of page version storage
# and garbage collection in object_repository.rs.
#
def test_seq_scan(zenith_cli, pageserver, postgres, pg_bin):
    zenith_cli.run(["branch", "test_seq_scan", "empty"])
    pg = postgres.create_start('test_seq_scan')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("create table t(c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint)")
            cur.execute("insert into t values (generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000))")
            cur.execute("set max_parallel_workers_per_gather=0");
            for i in range(100):
                start = time.time()
                cur.execute("select count(*) from t");
                stop = time.time()
                print(f'Elapsed time for iterating through 1000000 records is {stop - start}')
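The loop above times repeated sequential scans over the one-million-row table. A small reusable variant of the same timing loop, assuming an open cursor on a database that already contains the table t; the helper name is illustrative.

import time

def time_seq_scans(cur, iterations=100):
    # Run count(*) repeatedly and collect per-iteration wall-clock timings.
    timings = []
    for _ in range(iterations):
        start = time.time()
        cur.execute('select count(*) from t')
        cur.fetchone()
        timings.append(time.time() - start)
    return timings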
@@ -1,58 +1,46 @@
pytest_plugins = ("fixtures.zenith_fixtures")


#
# Test branching, when a transaction is in prepared state
#
import pytest
import getpass
import psycopg2

pytest_plugins = ("fixtures.zenith_fixtures")

def test_twophase(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_twophase", "empty"]);
zenith_cli.run(["branch", "test_twophase", "empty"])

pg = postgres.create_start('test_twophase', ['max_prepared_transactions=5'])
pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
print("postgres is running on 'test_twophase' branch")

conn = psycopg2.connect(pg.connstr());
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
conn = pg.connect()
cur = conn.cursor()

cur.execute('CREATE TABLE foo (t text)');
cur.execute('CREATE TABLE foo (t text)')

# Prepare a transaction that will insert a row
cur.execute('BEGIN');
cur.execute("INSERT INTO foo VALUES ('one')");
cur.execute("PREPARE TRANSACTION 'insert_one'");
cur.execute('BEGIN')
cur.execute("INSERT INTO foo VALUES ('one')")
cur.execute("PREPARE TRANSACTION 'insert_one'")

# Prepare another transaction that will insert a row
cur.execute('BEGIN');
cur.execute("INSERT INTO foo VALUES ('two')");
cur.execute("PREPARE TRANSACTION 'insert_two'");

cur.execute('BEGIN');
cur.execute("INSERT INTO foo VALUES ('three')");
cur.execute("PREPARE TRANSACTION 'insert_three'");
cur.execute("COMMIT PREPARED 'insert_three'");

cur.execute('SELECT pg_current_wal_insert_lsn()');
lsn = cur.fetchone()[0]
cur.execute('BEGIN')
cur.execute("INSERT INTO foo VALUES ('two')")
cur.execute("PREPARE TRANSACTION 'insert_two'")

# Create a branch with the transaction in prepared state
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase@"+lsn]);
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])

pg2 = postgres.create_start('test_twophase_prepared', ['max_prepared_transactions=5'])
conn2 = psycopg2.connect(pg2.connstr());
conn2.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
pg2 = postgres.create_start('test_twophase_prepared',
config_lines=['max_prepared_transactions=5'])
conn2 = pg2.connect()
cur2 = conn2.cursor()

# On the new branch, commit one of the prepared transactions, abort the other one.
cur2.execute("COMMIT PREPARED 'insert_one'");
cur2.execute("ROLLBACK PREPARED 'insert_two'");
cur2.execute("COMMIT PREPARED 'insert_one'")
cur2.execute("ROLLBACK PREPARED 'insert_two'")

cur2.execute('SELECT * FROM foo');
assert(cur2.fetchall() == [('one',),('three',)]);
cur2.execute('SELECT * FROM foo')
assert cur2.fetchall() == [('one', )]

# Neither insert is visible on the original branch, the transactions are still
# in prepared state there.
cur.execute('SELECT * FROM foo');
assert(cur.fetchall() == [('three',)]);
cur.execute('SELECT * FROM foo')
assert cur.fetchall() == []
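Note that the connection must be in autocommit mode for the explicit BEGIN / PREPARE TRANSACTION sequence above to reach the server as written, which is what the pg.connect() helper provides. A minimal sketch of preparing one insert in that style; the helper name and gid value are illustrative, while the foo table comes from the test.

def prepare_insert(conn, gid, value):
    # conn is expected to be in autocommit mode, as in the test above.
    cur = conn.cursor()
    cur.execute('BEGIN')
    cur.execute('INSERT INTO foo VALUES (%s)', (value, ))
    # Leave the transaction in prepared state; a later session can
    # COMMIT PREPARED or ROLLBACK PREPARED it by gid.
    cur.execute(f"PREPARE TRANSACTION '{gid}'")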
199
test_runner/batch_others/test_wal_acceptor.py
Normal file
199
test_runner/batch_others/test_wal_acceptor.py
Normal file
@@ -0,0 +1,199 @@
|
||||
import pytest
|
||||
import random
|
||||
import time
|
||||
|
||||
from contextlib import closing
|
||||
from multiprocessing import Process, Value
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
# basic test, write something in setup with wal acceptors, ensure that commits
|
||||
# succeed and data is written
|
||||
def test_normal_work(zenith_cli, pageserver, postgres, wa_factory):
|
||||
zenith_cli.run(["branch", "test_wal_acceptors_normal_work", "empty"])
|
||||
wa_factory.start_n_new(3)
|
||||
pg = postgres.create_start('test_wal_acceptors_normal_work',
|
||||
wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# we rely upon autocommit after each statement
|
||||
# as waiting for acceptors happens there
|
||||
cur.execute('CREATE TABLE t(key int primary key, value text)')
|
||||
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
assert cur.fetchone() == (5000050000, )
|
||||
|
||||
|
||||
# Run page server and multiple acceptors, and multiple compute nodes running
|
||||
# against different timelines.
|
||||
def test_many_timelines(zenith_cli, pageserver, postgres, wa_factory):
|
||||
n_timelines = 2
|
||||
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
|
||||
|
||||
# start postgres on each timeline
|
||||
pgs = []
|
||||
for branch in branches:
|
||||
zenith_cli.run(["branch", branch, "empty"])
|
||||
pgs.append(postgres.create_start(branch, wal_acceptors=wa_factory.get_connstrs()))
|
||||
|
||||
# Do everything in different loops to have actions on different timelines
|
||||
# interleaved.
|
||||
# create schema
|
||||
for pg in pgs:
|
||||
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||
|
||||
# Populate data
|
||||
for pg in pgs:
|
||||
pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
||||
|
||||
# Check data
|
||||
for pg in pgs:
|
||||
res = pg.safe_psql("SELECT sum(key) FROM t")
|
||||
assert res[0] == (5000050000, )
|
||||
|
||||
|
||||
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
|
||||
# times, with fault_probability chance of getting a wal acceptor down or up
|
||||
# along the way. 2 of 3 are always alive, so the work keeps going.
|
||||
def test_restarts(zenith_cli, pageserver, postgres, wa_factory):
|
||||
fault_probability = 0.01
|
||||
n_inserts = 1000
|
||||
n_acceptors = 3
|
||||
|
||||
wa_factory.start_n_new(n_acceptors)
|
||||
|
||||
zenith_cli.run(["branch", "test_wal_acceptors_restarts", "empty"])
|
||||
pg = postgres.create_start('test_wal_acceptors_restarts',
|
||||
wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
# we rely upon autocommit after each statement
|
||||
# as waiting for acceptors happens there
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
failed_node = None
|
||||
cur.execute('CREATE TABLE t(key int primary key, value text)')
|
||||
for i in range(n_inserts):
|
||||
cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, ))
|
||||
|
||||
if random.random() <= fault_probability:
|
||||
if failed_node is None:
|
||||
failed_node = wa_factory.instances[random.randrange(0, n_acceptors)]
|
||||
failed_node.stop()
|
||||
else:
|
||||
failed_node.start()
|
||||
failed_node = None
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
assert cur.fetchone() == (500500, )
|
||||
|
||||
|
||||
start_delay_sec = 2
|
||||
|
||||
|
||||
def delayed_wal_acceptor_start(wa):
|
||||
time.sleep(start_delay_sec)
|
||||
wa.start()
|
||||
|
||||
|
||||
# When majority of acceptors is offline, commits are expected to be frozen
|
||||
def test_unavailability(zenith_cli, pageserver, postgres, wa_factory):
|
||||
wa_factory.start_n_new(2)
|
||||
|
||||
zenith_cli.run(["branch", "test_wal_acceptors_unavailability", "empty"])
|
||||
pg = postgres.create_start('test_wal_acceptors_unavailability',
|
||||
wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
# we rely upon autocommit after each statement
|
||||
# as waiting for acceptors happens there
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
# check basic work with table
|
||||
cur.execute('CREATE TABLE t(key int primary key, value text)')
|
||||
cur.execute("INSERT INTO t values (1, 'payload')")
|
||||
|
||||
# shutdown one of two acceptors, that is, majority
|
||||
wa_factory.instances[0].stop()
|
||||
|
||||
proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[0], ))
|
||||
proc.start()
|
||||
|
||||
start = time.time()
|
||||
cur.execute("INSERT INTO t values (2, 'payload')")
|
||||
# ensure that the query above was hanging while acceptor was down
|
||||
assert (time.time() - start) >= start_delay_sec
|
||||
proc.join()
|
||||
|
||||
# for the world's balance, do the same with second acceptor
|
||||
wa_factory.instances[1].stop()
|
||||
|
||||
proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[1], ))
|
||||
proc.start()
|
||||
|
||||
start = time.time()
|
||||
cur.execute("INSERT INTO t values (3, 'payload')")
|
||||
# ensure that the query above was hanging while acceptor was down
|
||||
assert (time.time() - start) >= start_delay_sec
|
||||
proc.join()
|
||||
|
||||
cur.execute("INSERT INTO t values (4, 'payload')")
|
||||
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
assert cur.fetchone() == (10, )
|
||||
|
||||
|
||||
# shut down random subset of acceptors, sleep, wake them up, rinse, repeat
|
||||
def xmas_garland(acceptors, stop):
|
||||
while not bool(stop.value):
|
||||
victims = []
|
||||
for wa in acceptors:
|
||||
if random.random() >= 0.5:
|
||||
victims.append(wa)
|
||||
for v in victims:
|
||||
v.stop()
|
||||
time.sleep(1)
|
||||
for v in victims:
|
||||
v.start()
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
# value which gets unset on exit
|
||||
@pytest.fixture
|
||||
def stop_value():
|
||||
stop = Value('i', 0)
|
||||
yield stop
|
||||
stop.value = 1
|
||||
|
||||
|
||||
# do inserts while concurrently getting up/down subsets of acceptors
|
||||
def test_race_conditions(zenith_cli, pageserver, postgres, wa_factory, stop_value):
|
||||
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
zenith_cli.run(["branch", "test_wal_acceptors_race_conditions", "empty"])
|
||||
pg = postgres.create_start('test_wal_acceptors_race_conditions',
|
||||
wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
# we rely upon autocommit after each statement
|
||||
# as waiting for acceptors happens there
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
cur.execute('CREATE TABLE t(key int primary key, value text)')
|
||||
|
||||
proc = Process(target=xmas_garland, args=(wa_factory.instances, stop_value))
|
||||
proc.start()
|
||||
|
||||
for i in range(1000):
|
||||
cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, ))
|
||||
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
assert cur.fetchone() == (500500, )
|
||||
|
||||
stop_value.value = 1
|
||||
proc.join()
|
||||
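The unavailability test above checks that a commit blocks while a majority of wal acceptors is down and completes once the quorum is restored. A sketch of that measurement as a reusable check, reusing the module-level delayed-start pattern from this file; it assumes a WalAcceptor-like object with stop()/start() and an open autocommit cursor on the compute node, and the inserted key is illustrative.

import time
from multiprocessing import Process

def _delayed_start(acceptor, delay_sec):
    time.sleep(delay_sec)
    acceptor.start()

def assert_commit_waits_for_quorum(acceptor, cur, delay_sec=2.0):
    # Take one acceptor down and schedule it to come back after delay_sec.
    acceptor.stop()
    proc = Process(target=_delayed_start, args=(acceptor, delay_sec))
    proc.start()

    start = time.time()
    cur.execute("INSERT INTO t values (100, 'payload')")  # should block until quorum returns
    assert time.time() - start >= delay_sec
    proc.join()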
@@ -1,49 +1,48 @@
import pytest
import psycopg2
import json

pytest_plugins = ("fixtures.zenith_fixtures")


def helper_compare_branch_list(page_server_cur, zenith_cli):
"""
Compare branches list returned by CLI and directly via API.
Filters out branches created by other tests.
"""

page_server_cur.execute('branch_list;')
page_server_cur.execute('branch_list')
branches_api = sorted(map(lambda b: b['name'], json.loads(page_server_cur.fetchone()[0])))
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]

res = zenith_cli.run(["branch"]);
assert(res.stderr == '')
res = zenith_cli.run(["branch"])
assert res.stderr == ''
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]

assert(branches_api == branches_cli)
assert branches_api == branches_cli


def test_cli_branch_list(pageserver, zenith_cli):

page_server_conn = psycopg2.connect(pageserver.connstr())
page_server_conn.autocommit = True
page_server_conn = pageserver.connect()
page_server_cur = page_server_conn.cursor()

# Initial sanity check
helper_compare_branch_list(page_server_cur, zenith_cli)

# Create a branch for us
res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"]);
assert(res.stderr == '')
res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"])
assert res.stderr == ''
helper_compare_branch_list(page_server_cur, zenith_cli)

# Create a nested branch
res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"]);
assert(res.stderr == '')
res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
assert res.stderr == ''
helper_compare_branch_list(page_server_cur, zenith_cli)

# Check that all new branches are visible via CLI
res = zenith_cli.run(["branch"]);
assert(res.stderr == '')
res = zenith_cli.run(["branch"])
assert res.stderr == ''
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))

assert('test_cli_branch_list_main' in branches_cli)
assert('test_cli_branch_list_nested' in branches_cli)
assert 'test_cli_branch_list_main' in branches_cli
assert 'test_cli_branch_list_nested' in branches_cli
test_runner/batch_pg_regress/test_isolation.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import os

from fixtures.utils import mkdir_if_needed

pytest_plugins = ("fixtures.zenith_fixtures")


def test_isolation(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
                   base_dir, capsys):

    # Create a branch for us
    zenith_cli.run(["branch", "test_isolation", "empty"])

    # Connect to postgres and create a database called "regression".
    # isolation tests use prepared transactions, so enable them
    pg = postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
    pg.safe_psql('CREATE DATABASE isolation_regression')

    # Create some local directories for pg_isolation_regress to run in.
    runpath = os.path.join(test_output_dir, 'regress')
    mkdir_if_needed(runpath)
    mkdir_if_needed(os.path.join(runpath, 'testtablespace'))

    # Compute all the file locations that pg_isolation_regress will need.
    build_path = os.path.join(pg_distrib_dir, 'build/src/test/isolation')
    src_path = os.path.join(base_dir, 'vendor/postgres/src/test/isolation')
    bindir = os.path.join(pg_distrib_dir, 'bin')
    schedule = os.path.join(src_path, 'isolation_schedule')
    pg_isolation_regress = os.path.join(build_path, 'pg_isolation_regress')

    pg_isolation_regress_command = [
        pg_isolation_regress,
        '--use-existing',
        '--bindir={}'.format(bindir),
        '--dlpath={}'.format(build_path),
        '--inputdir={}'.format(src_path),
        '--schedule={}'.format(schedule),
    ]

    env = {
        'PGPORT': str(pg.port),
        'PGUSER': pg.username,
        'PGHOST': pg.host,
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_isolation_regress_command, env=env, cwd=runpath)
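pg_isolation_regress is pointed at the already-running compute node purely through libpq environment variables, as the env dict above shows. A small helper for building that environment on top of the current process environment; the function name is illustrative, and host, port and user would normally come from the pg fixture.

import os

def regress_env(host, port, user):
    # Copy the current environment and point libpq tools at the running node.
    env = os.environ.copy()
    env.update({'PGHOST': host, 'PGPORT': str(port), 'PGUSER': user})
    return env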
@@ -1,28 +1,19 @@
import pytest
from fixtures.utils import mkdir_if_needed
import getpass
import os
import psycopg2

from fixtures.utils import mkdir_if_needed

pytest_plugins = ("fixtures.zenith_fixtures")

# FIXME: put host + port in a fixture
HOST = 'localhost'
PORT = 55432


def test_pg_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):
def test_pg_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
base_dir, capsys):

# Create a branch for us
zenith_cli.run(["branch", "test_pg_regress", "empty"]);
zenith_cli.run(["branch", "test_pg_regress", "empty"])

# Connect to postgres and create a database called "regression".
pg = postgres.create_start('test_pg_regress')
pg_conn = psycopg2.connect(pg.connstr())
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur = pg_conn.cursor()
cur.execute('CREATE DATABASE regression')
pg_conn.close()
pg.safe_psql('CREATE DATABASE regression')

# Create some local directories for pg_regress to run in.
runpath = os.path.join(test_output_dir, 'regress')
@@ -30,10 +21,8 @@ def test_pg_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, p
mkdir_if_needed(os.path.join(runpath, 'testtablespace'))

# Compute all the file locations that pg_regress will need.
build_path = os.path.join(
pg_distrib_dir, 'build/src/test/regress')
src_path = os.path.join(
base_dir, 'vendor/postgres/src/test/regress')
build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress')
src_path = os.path.join(base_dir, 'vendor/postgres/src/test/regress')
bindir = os.path.join(pg_distrib_dir, 'bin')
schedule = os.path.join(src_path, 'parallel_schedule')
pg_regress = os.path.join(build_path, 'pg_regress')
@@ -1,28 +1,19 @@
import pytest
from fixtures.utils import mkdir_if_needed
import getpass
import os
import psycopg2

from fixtures.utils import mkdir_if_needed

pytest_plugins = ("fixtures.zenith_fixtures")

# FIXME: put host + port in a fixture
HOST = 'localhost'
PORT = 55432


def test_zenith_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):
def test_zenith_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
base_dir, capsys):

# Create a branch for us
zenith_cli.run(["branch", "test_zenith_regress", "empty"]);
zenith_cli.run(["branch", "test_zenith_regress", "empty"])

# Connect to postgres and create a database called "regression".
pg = postgres.create_start('test_zenith_regress')
pg_conn = psycopg2.connect(pg.connstr())
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur = pg_conn.cursor()
cur.execute('CREATE DATABASE regression')
pg_conn.close()
pg.safe_psql('CREATE DATABASE regression')

# Create some local directories for pg_regress to run in.
runpath = os.path.join(test_output_dir, 'regress')
@@ -31,10 +22,8 @@ def test_zenith_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_di

# Compute all the file locations that pg_regress will need.
# This test runs zenith specific tests
build_path = os.path.join(
pg_distrib_dir, 'build/src/test/regress')
src_path = os.path.join(
base_dir, 'test_runner/zenith_regress')
build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress')
src_path = os.path.join(base_dir, 'test_runner/zenith_regress')
bindir = os.path.join(pg_distrib_dir, 'bin')
schedule = os.path.join(src_path, 'parallel_schedule')
pg_regress = os.path.join(build_path, 'pg_regress')
@@ -1,13 +1,15 @@

import os
import subprocess

def get_self_dir():
from typing import Any, List


def get_self_dir() -> str:
""" Get the path to the directory where this script lives. """
return os.path.dirname(os.path.abspath(__file__))


def mkdir_if_needed(path):
def mkdir_if_needed(path: str) -> None:
""" Create a directory if it doesn't already exist

Note this won't try to create intermediate directories.
@@ -18,7 +20,7 @@ def mkdir_if_needed(path):
os.mkdir(path)


def subprocess_capture(capture_dir, cmd, **kwargs):
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
""" Run a process and capture its output

Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
@@ -42,7 +44,7 @@ def subprocess_capture(capture_dir, cmd, **kwargs):
_global_counter = 0


def global_counter():
def global_counter() -> int:
""" A really dumb global counter.

This is useful for giving output files a unique number, so if we run the
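The hunk above only adds type annotations to utils.py; the behaviour of subprocess_capture stays the same: run a command and write its stdout and stderr to numbered files. An illustrative typed helper in the same spirit; the function name and file-naming scheme are simplified for the sketch and are not the exact code in utils.py.

import os
import subprocess
from typing import Any, List

def capture_run(capture_dir: str, cmd: List[str], counter: int, **kwargs: Any) -> None:
    # Write "<basename>_<counter>.stdout" and ".stderr" files in capture_dir.
    base = os.path.join(capture_dir, f'{os.path.basename(cmd[0])}_{counter}')
    with open(base + '.stdout', 'w') as stdout, open(base + '.stderr', 'w') as stderr:
        subprocess.run(cmd, stdout=stdout, stderr=stderr, check=True, **kwargs)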
@@ -3,11 +3,18 @@ import os
|
||||
import psycopg2
|
||||
import pytest
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
from .utils import (get_self_dir, mkdir_if_needed,
|
||||
subprocess_capture, global_counter)
|
||||
|
||||
from contextlib import closing
|
||||
from pathlib import Path
|
||||
|
||||
# Type-related stuff
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast
|
||||
from typing_extensions import Literal
|
||||
|
||||
from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)
|
||||
"""
|
||||
This file contains pytest fixtures. A fixture is a test resource that can be
|
||||
summoned by placing its name in the test's arguments.
|
||||
@@ -20,22 +27,28 @@ ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.
|
||||
|
||||
To use fixtures in a test file, add this line of code:
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
>>> pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
Don't import functions from this file, or pytest will emit warnings. Instead
|
||||
put directly-importable functions into utils.py or another separate file.
|
||||
"""
|
||||
|
||||
Env = Dict[str, str]
|
||||
Fn = TypeVar('Fn', bound=Callable[..., Any])
|
||||
|
||||
DEFAULT_OUTPUT_DIR = 'test_output'
|
||||
DEFAULT_POSTGRES_DIR = 'tmp_install'
|
||||
|
||||
DEFAULT_PAGESERVER_PORT = 64000
|
||||
|
||||
def determine_scope(fixture_name, config):
|
||||
|
||||
def determine_scope(fixture_name: str, config: Any) -> str:
|
||||
return 'session'
|
||||
|
||||
|
||||
def zenfixture(func):
|
||||
""" This is a python decorator for fixtures with a flexible scope.
|
||||
def zenfixture(func: Fn) -> Fn:
|
||||
"""
|
||||
This is a python decorator for fixtures with a flexible scope.
|
||||
|
||||
By default every test function will set up and tear down a new
|
||||
database. In pytest, this is called fixtures "function" scope.
|
||||
@@ -43,18 +56,18 @@ def zenfixture(func):
|
||||
If the environment variable TEST_SHARED_FIXTURES is set, then all
|
||||
tests will share the same database. State, logs, etc. will be
|
||||
stored in a directory called "shared".
|
||||
|
||||
"""
|
||||
if os.environ.get('TEST_SHARED_FIXTURES') is None:
|
||||
scope = 'function'
|
||||
else:
|
||||
scope = 'session'
|
||||
|
||||
scope: Literal['session', 'function'] = \
|
||||
'function' if os.environ.get('TEST_SHARED_FIXTURES') is None else 'session'
|
||||
|
||||
return pytest.fixture(func, scope=scope)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True, scope='session')
|
||||
def safety_check():
|
||||
def safety_check() -> None:
|
||||
""" Ensure that no unwanted daemons are running before we start testing. """
|
||||
|
||||
# does not use -c as it is not supported on macOS
|
||||
cmd = ['pgrep', 'pageserver|postgres|wal_acceptor']
|
||||
result = subprocess.run(cmd, stdout=subprocess.DEVNULL)
|
||||
@@ -65,14 +78,56 @@ def safety_check():
|
||||
raise Exception('found interfering processes running')
|
||||
|
||||
|
||||
class PgProtocol:
|
||||
""" Reusable connection logic """
|
||||
def __init__(self, host: str, port: int, username: Optional[str] = None):
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.username = username or getpass.getuser()
|
||||
|
||||
def connstr(self, *, dbname: str = 'postgres', username: Optional[str] = None) -> str:
|
||||
"""
|
||||
Build a libpq connection string for the Postgres instance.
|
||||
"""
|
||||
|
||||
username = username or self.username
|
||||
return f'host={self.host} port={self.port} user={username} dbname={dbname}'
|
||||
|
||||
# autocommit=True here by default because that's what we need most of the time
|
||||
def connect(self, *, autocommit=True, **kwargs: Any) -> PgConnection:
|
||||
"""
|
||||
Connect to the node.
|
||||
Returns psycopg2's connection object.
|
||||
This method passes all extra params to connstr.
|
||||
"""
|
||||
|
||||
conn = psycopg2.connect(self.connstr(**kwargs))
|
||||
# WARNING: this setting affects *all* tests!
|
||||
conn.autocommit = autocommit
|
||||
return conn
|
||||
|
||||
def safe_psql(self, query: str, **kwargs: Any) -> List[Any]:
|
||||
"""
|
||||
Execute query against the node and return all rows.
|
||||
This method passes all extra params to connstr.
|
||||
"""
|
||||
|
||||
with closing(self.connect(**kwargs)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(query)
|
||||
if cur.description is None:
|
||||
return [] # query didn't return data
|
||||
return cast(List[Any], cur.fetchall())
|
||||
|
||||
|
||||
class ZenithCli:
|
||||
""" An object representing the CLI binary named "zenith".
|
||||
"""
|
||||
An object representing the CLI binary named "zenith".
|
||||
|
||||
We also store an environment that will tell the CLI to operate
|
||||
on a particular ZENITH_REPO_DIR.
|
||||
"""
|
||||
|
||||
def __init__(self, binpath, repo_dir, pg_distrib_dir):
|
||||
def __init__(self, binpath: str, repo_dir: str, pg_distrib_dir: str):
|
||||
assert os.path.isdir(binpath)
|
||||
self.binpath = binpath
|
||||
self.bin_zenith = os.path.join(binpath, 'zenith')
|
||||
@@ -80,148 +135,259 @@ class ZenithCli:
|
||||
self.env['ZENITH_REPO_DIR'] = repo_dir
|
||||
self.env['POSTGRES_DISTRIB_DIR'] = pg_distrib_dir
|
||||
|
||||
def run(self, arguments):
|
||||
""" Run "zenith" with the specified arguments.
|
||||
def run(self, arguments: List[str]) -> Any:
|
||||
"""
|
||||
Run "zenith" with the specified arguments.
|
||||
|
||||
arguments must be in list form, e.g. ['pg', 'create']
|
||||
Arguments must be in list form, e.g. ['pg', 'create']
|
||||
|
||||
Return both stdout and stderr, which can be accessed as
|
||||
|
||||
result = zenith_cli.run(...)
|
||||
assert(result.stderr == "")
|
||||
print(result.stdout)
|
||||
|
||||
>>> result = zenith_cli.run(...)
|
||||
>>> assert result.stderr == ""
|
||||
>>> print(result.stdout)
|
||||
"""
|
||||
|
||||
assert type(arguments) == list
|
||||
|
||||
args = [self.bin_zenith] + arguments
|
||||
print('Running command "{}"'.format(' '.join(args)))
|
||||
return subprocess.run(args, env=self.env, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
return subprocess.run(args,
|
||||
env=self.env,
|
||||
check=True,
|
||||
universal_newlines=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
|
||||
|
||||
@zenfixture
|
||||
def zenith_cli(zenith_binpath, repo_dir, pg_distrib_dir):
|
||||
def zenith_cli(zenith_binpath: str, repo_dir: str, pg_distrib_dir: str) -> ZenithCli:
|
||||
return ZenithCli(zenith_binpath, repo_dir, pg_distrib_dir)
|
||||
|
||||
|
||||
class ZenithPageserver:
|
||||
class ZenithPageserver(PgProtocol):
|
||||
""" An object representing a running pageserver. """
|
||||
def __init__(self, zenith_cli: ZenithCli):
|
||||
super().__init__(host='localhost', port=DEFAULT_PAGESERVER_PORT)
|
||||
|
||||
def __init__(self, zenith_cli):
|
||||
self.zenith_cli = zenith_cli
|
||||
self.running = False
|
||||
|
||||
# Initialize the repository, i.e. run "zenith init"
|
||||
def init(self):
|
||||
self.zenith_cli.run(['init'])
|
||||
def init(self) -> 'ZenithPageserver':
|
||||
"""
|
||||
Initialize the repository, i.e. run "zenith init".
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
self.zenith_cli.run(['init'])
|
||||
return self
|
||||
|
||||
def start(self) -> 'ZenithPageserver':
|
||||
"""
|
||||
Start the page server.
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
# Start the page server
|
||||
def start(self):
|
||||
self.zenith_cli.run(['start'])
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
# Stop the page server
|
||||
def stop(self):
|
||||
self.zenith_cli.run(['stop'])
|
||||
self.running = True
|
||||
def stop(self) -> 'ZenithPageserver':
|
||||
"""
|
||||
Stop the page server.
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
if self.running:
|
||||
self.zenith_cli.run(['stop'])
|
||||
self.running = False
|
||||
|
||||
return self
|
||||
|
||||
# The page server speaks the Postgres FE/BE protocol, so you can connect
|
||||
# to it with any Postgres client, and run special commands. This function
|
||||
# returns a libpq connection string for connecting to it.
|
||||
def connstr(self):
|
||||
username = getpass.getuser()
|
||||
conn_str = 'host={} port={} dbname=postgres user={}'.format(
|
||||
'localhost', 64000, username)
|
||||
return conn_str
|
||||
|
||||
# The 'pageserver' fixture provides a Page Server that's up and running.
|
||||
#
|
||||
# If TEST_SHARED_FIXTURES is set, the Page Server instance is shared by all
|
||||
# the tests. To avoid clashing with other tests, don't use the 'main' branch in
|
||||
# the tests directly. Instead, create a branch off the 'empty' branch and use
|
||||
# that.
|
||||
#
|
||||
# By convention, the test branches are named after the tests. For example,
|
||||
# test called 'test_foo' would create and use branches with the 'test_foo' prefix.
|
||||
@zenfixture
|
||||
def pageserver(zenith_cli):
|
||||
ps = ZenithPageserver(zenith_cli)
|
||||
ps.init()
|
||||
ps.start()
|
||||
def pageserver(zenith_cli: ZenithCli) -> Iterator[ZenithPageserver]:
|
||||
"""
|
||||
The 'pageserver' fixture provides a Page Server that's up and running.
|
||||
|
||||
If TEST_SHARED_FIXTURES is set, the Page Server instance is shared by all
|
||||
the tests. To avoid clashing with other tests, don't use the 'main' branch in
|
||||
the tests directly. Instead, create a branch off the 'empty' branch and use
|
||||
that.
|
||||
|
||||
By convention, the test branches are named after the tests. For example,
|
||||
test called 'test_foo' would create and use branches with the 'test_foo' prefix.
|
||||
"""
|
||||
|
||||
ps = ZenithPageserver(zenith_cli).init().start()
|
||||
# For convenience in tests, create a branch from the freshly-initialized cluster.
|
||||
zenith_cli.run(["branch", "empty", "main"]);
|
||||
zenith_cli.run(["branch", "empty", "main"])
|
||||
|
||||
yield ps
|
||||
|
||||
# After the yield comes any cleanup code we need.
|
||||
print('Starting pageserver cleanup')
|
||||
ps.stop()
|
||||
|
||||
class Postgres:
|
||||
""" An object representing a running postgres daemon. """
|
||||
|
||||
def __init__(self, zenith_cli, repo_dir, instance_num):
|
||||
class Postgres(PgProtocol):
|
||||
""" An object representing a running postgres daemon. """
|
||||
def __init__(self, zenith_cli: ZenithCli, repo_dir: str, instance_num: int):
|
||||
super().__init__(host='localhost', port=55431 + instance_num)
|
||||
|
||||
self.zenith_cli = zenith_cli
|
||||
self.instance_num = instance_num
|
||||
self.running = False
|
||||
self.username = getpass.getuser()
|
||||
self.host = 'localhost'
|
||||
self.port = 55431 + instance_num
|
||||
self.repo_dir = repo_dir
|
||||
self.branch = None
|
||||
self.branch: Optional[str] = None # dubious, see asserts below
|
||||
# path to conf is <repo_dir>/pgdatadirs/<branch_name>/postgresql.conf
|
||||
|
||||
def create_start(self, branch, config_lines=None):
|
||||
""" create the pg data directory, and start the server """
|
||||
def create(self,
|
||||
branch: str,
|
||||
wal_acceptors: Optional[str] = None,
|
||||
config_lines: Optional[List[str]] = None) -> 'Postgres':
|
||||
"""
|
||||
Create the pg data directory.
|
||||
If wal_acceptors is not None, node will use wal acceptors; config is
|
||||
adjusted accordingly.
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
if not config_lines:
|
||||
config_lines = []
|
||||
|
||||
self.zenith_cli.run(['pg', 'create', branch])
|
||||
self.branch = branch
|
||||
if wal_acceptors is not None:
|
||||
self.adjust_for_wal_acceptors(wal_acceptors)
|
||||
if config_lines is None:
|
||||
config_lines = []
|
||||
self.config(config_lines)
|
||||
self.zenith_cli.run(['pg', 'start', branch])
|
||||
self.running = True
|
||||
return
|
||||
|
||||
#lines should be an array of valid postgresql.conf rows
|
||||
def config(self, lines):
|
||||
filename = 'pgdatadirs/{}/postgresql.conf'.format(self.branch)
|
||||
config_name = os.path.join(self.repo_dir, filename)
|
||||
with open(config_name, 'a') as conf:
|
||||
return self
|
||||
|
||||
def start(self) -> 'Postgres':
|
||||
"""
|
||||
Start the Postgres instance.
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
assert self.branch is not None
|
||||
self.zenith_cli.run(['pg', 'start', self.branch])
|
||||
self.running = True
|
||||
|
||||
return self
|
||||
|
||||
def config_file_path(self) -> str:
|
||||
""" Path to postgresql.conf """
|
||||
filename = f'pgdatadirs/{self.branch}/postgresql.conf'
|
||||
return os.path.join(self.repo_dir, filename)
|
||||
|
||||
def adjust_for_wal_acceptors(self, wal_acceptors: str) -> 'Postgres':
|
||||
"""
|
||||
Adjust instance config for working with wal acceptors instead of
|
||||
pageserver (pre-configured by CLI) directly.
|
||||
"""
|
||||
|
||||
# TODO: reuse config()
|
||||
with open(self.config_file_path(), "r") as f:
|
||||
cfg_lines = f.readlines()
|
||||
with open(self.config_file_path(), "w") as f:
|
||||
for cfg_line in cfg_lines:
|
||||
# walproposer uses different application_name
|
||||
if ("synchronous_standby_names" in cfg_line or
|
||||
# don't ask pageserver to fetch WAL from compute
|
||||
"callmemaybe_connstring" in cfg_line):
|
||||
continue
|
||||
f.write(cfg_line)
|
||||
f.write("synchronous_standby_names = 'walproposer'\n")
|
||||
f.write("wal_acceptors = '{}'\n".format(wal_acceptors))
|
||||
return self
|
||||
|
||||
def config(self, lines: List[str]) -> 'Postgres':
|
||||
"""
|
||||
Add lines to postgresql.conf.
|
||||
Lines should be an array of valid postgresql.conf rows.
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
with open(self.config_file_path(), 'a') as conf:
|
||||
for line in lines:
|
||||
conf.write(line)
|
||||
conf.write('\n')
|
||||
|
||||
def stop(self):
|
||||
if self.running:
|
||||
self.zenith_cli.run(['pg', 'stop', self.branch])
|
||||
return self
|
||||
|
||||
def stop(self) -> 'Postgres':
|
||||
"""
|
||||
Stop the Postgres instance if it's running.
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
if self.running:
|
||||
assert self.branch is not None
|
||||
self.zenith_cli.run(['pg', 'stop', self.branch])
|
||||
self.running = False
|
||||
|
||||
return self
|
||||
|
||||
def stop_and_destroy(self) -> 'Postgres':
|
||||
"""
|
||||
Stop the Postgres instance, then destroy it.
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
assert self.branch is not None
|
||||
self.zenith_cli.run(['pg', 'stop', '--destroy', self.branch])
|
||||
|
||||
return self
|
||||
|
||||
def create_start(self,
|
||||
branch: str,
|
||||
wal_acceptors: Optional[str] = None,
|
||||
config_lines: Optional[List[str]] = None) -> 'Postgres':
|
||||
"""
|
||||
Create a Postgres instance, then start it.
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
self.create(branch, wal_acceptors, config_lines).start()
|
||||
|
||||
return self
|
||||
|
||||
# Return a libpq connection string to connect to the Postgres instance
|
||||
def connstr(self, dbname='postgres'):
|
||||
conn_str = 'host={} port={} dbname={} user={}'.format(
|
||||
self.host, self.port, dbname, self.username)
|
||||
return conn_str
|
||||
|
||||
class PostgresFactory:
|
||||
""" An object representing multiple running postgres daemons. """
|
||||
def __init__(self, zenith_cli, repo_dir):
|
||||
def __init__(self, zenith_cli: ZenithCli, repo_dir: str):
|
||||
self.zenith_cli = zenith_cli
|
||||
self.host = 'localhost'
|
||||
self.repo_dir = repo_dir
|
||||
self.num_instances = 0
|
||||
self.instances = []
|
||||
self.instances: List[Postgres] = []
|
||||
|
||||
def create_start(self,
|
||||
branch: str = "main",
|
||||
wal_acceptors: Optional[str] = None,
|
||||
config_lines: Optional[List[str]] = None) -> Postgres:
|
||||
|
||||
def create_start(self, branch="main", config_lines=None):
|
||||
pg = Postgres(self.zenith_cli, self.repo_dir, self.num_instances + 1)
|
||||
self.num_instances += 1
|
||||
self.instances.append(pg)
|
||||
pg.create_start(branch, config_lines)
|
||||
return pg
|
||||
|
||||
def stop_all(self):
|
||||
return pg.create_start(branch, wal_acceptors, config_lines)
|
||||
|
||||
def stop_all(self) -> 'PostgresFactory':
|
||||
for pg in self.instances:
|
||||
pg.stop()
|
||||
|
||||
return self
|
||||
|
||||
|
||||
@zenfixture
|
||||
def postgres(zenith_cli, repo_dir):
|
||||
def postgres(zenith_cli: ZenithCli, repo_dir: str) -> Iterator[PostgresFactory]:
|
||||
pgfactory = PostgresFactory(zenith_cli, repo_dir)
|
||||
|
||||
yield pgfactory
|
||||
|
||||
# After the yield comes any cleanup code we need.
|
||||
print('Starting postgres cleanup')
|
||||
pgfactory.stop_all()
|
||||
@@ -229,27 +395,27 @@ def postgres(zenith_cli, repo_dir):
|
||||
|
||||
class PgBin:
|
||||
""" A helper class for executing postgres binaries """
|
||||
|
||||
def __init__(self, log_dir, pg_distrib_dir):
|
||||
def __init__(self, log_dir: str, pg_distrib_dir: str):
|
||||
self.log_dir = log_dir
|
||||
self.pg_install_path = pg_distrib_dir
|
||||
self.pg_bin_path = os.path.join(self.pg_install_path, 'bin')
|
||||
self.env = os.environ.copy()
|
||||
self.env['LD_LIBRARY_PATH'] = os.path.join(self.pg_install_path, 'lib')
|
||||
|
||||
def _fixpath(self, command):
|
||||
if not '/' in command[0]:
|
||||
def _fixpath(self, command: List[str]) -> None:
|
||||
if '/' not in command[0]:
|
||||
command[0] = os.path.join(self.pg_bin_path, command[0])
|
||||
|
||||
def _build_env(self, env_add):
|
||||
def _build_env(self, env_add: Optional[Env]) -> Env:
|
||||
if env_add is None:
|
||||
return self.env
|
||||
env = self.env.copy()
|
||||
env.update(env_add)
|
||||
return env
|
||||
|
||||
def run(self, command, env=None, cwd=None):
|
||||
""" Run one of the postgres binaries.
|
||||
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None) -> None:
|
||||
"""
|
||||
Run one of the postgres binaries.
|
||||
|
||||
The command should be in list form, e.g. ['pgbench', '-p', '55432']
|
||||
|
||||
@@ -259,18 +425,23 @@ class PgBin:
|
||||
characters present), then it will be edited to include the correct path.
|
||||
|
||||
If you want stdout/stderr captured to files, use `run_capture` instead.
|
||||
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess.run(command, env=env, cwd=cwd, check=True)
|
||||
|
||||
def run_capture(self, command, env=None, cwd=None):
|
||||
""" Run one of the postgres binaries, with stderr and stdout redirected to a file.
|
||||
def run_capture(self,
|
||||
command: List[str],
|
||||
env: Optional[Env] = None,
|
||||
cwd: Optional[str] = None) -> None:
|
||||
"""
|
||||
Run one of the postgres binaries, with stderr and stdout redirected to a file.
|
||||
|
||||
This is just like `run`, but for chatty programs.
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
@@ -278,21 +449,126 @@ class PgBin:
|
||||
|
||||
|
||||
@zenfixture
|
||||
def pg_bin(test_output_dir, pg_distrib_dir):
|
||||
def pg_bin(test_output_dir: str, pg_distrib_dir: str) -> PgBin:
|
||||
return PgBin(test_output_dir, pg_distrib_dir)
|
||||
|
||||
|
||||
def read_pid(path):
|
||||
""" Read content of file into number """
|
||||
return int(Path(path).read_text())
|
||||
|
||||
|
||||
class WalAcceptor:
|
||||
""" An object representing a running wal acceptor daemon. """
|
||||
def __init__(self, wa_binpath, data_dir, port, num):
|
||||
self.wa_binpath = wa_binpath
|
||||
self.data_dir = data_dir
|
||||
self.port = port
|
||||
self.num = num # identifier for logging
|
||||
|
||||
def start(self) -> 'WalAcceptor':
|
||||
# create data directory if not exists
|
||||
Path(self.data_dir).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
cmd = [self.wa_binpath]
|
||||
cmd.extend(["-D", self.data_dir])
|
||||
cmd.extend(["-l", "localhost:{}".format(self.port)])
|
||||
cmd.append("--daemonize")
|
||||
cmd.append("--no-sync")
|
||||
# Tell page server it can receive WAL from this WAL safekeeper
|
||||
cmd.extend(["--pageserver", "localhost:{}".format(DEFAULT_PAGESERVER_PORT)])
|
||||
cmd.extend(["--recall", "1 second"])
|
||||
print('Running command "{}"'.format(' '.join(cmd)))
|
||||
subprocess.run(cmd, check=True)
|
||||
|
||||
return self
|
||||
|
||||
def stop(self) -> 'WalAcceptor':
|
||||
print('Stopping wal acceptor {}'.format(self.num))
|
||||
pidfile_path = os.path.join(self.data_dir, "wal_acceptor.pid")
|
||||
try:
|
||||
pid = read_pid(pidfile_path)
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
except Exception:
|
||||
pass # pidfile might be obsolete
|
||||
# TODO: cleanup pid file on exit in wal acceptor
|
||||
return self
|
||||
# for _ in range(5):
|
||||
# print('waiting wal acceptor {} (pid {}) to stop...', self.num, pid)
|
||||
# try:
|
||||
# read_pid(pidfile_path)
|
||||
# except FileNotFoundError:
|
||||
# return # done
|
||||
# time.sleep(1)
|
||||
# raise Exception('Failed to wait for wal acceptor {} shutdown'.format(self.num))
|
||||
except FileNotFoundError:
|
||||
print("Wal acceptor {} is not running".format(self.num))
|
||||
return self
|
||||
|
||||
|
||||
class WalAcceptorFactory:
|
||||
""" An object representing multiple running wal acceptors. """
|
||||
def __init__(self, zenith_binpath, data_dir):
|
||||
self.wa_binpath = os.path.join(zenith_binpath, 'wal_acceptor')
|
||||
self.data_dir = data_dir
|
||||
self.instances = []
|
||||
self.initial_port = 54321
|
||||
|
||||
def start_new(self) -> WalAcceptor:
|
||||
"""
|
||||
Start new wal acceptor.
|
||||
"""
|
||||
|
||||
wa_num = len(self.instances)
|
||||
wa = WalAcceptor(self.wa_binpath,
|
||||
os.path.join(self.data_dir, "wal_acceptor_{}".format(wa_num)),
|
||||
self.initial_port + wa_num, wa_num)
|
||||
wa.start()
|
||||
self.instances.append(wa)
|
||||
return wa
|
||||
|
||||
def start_n_new(self, n: int) -> None:
|
||||
"""
|
||||
Start n new wal acceptors.
|
||||
"""
|
||||
|
||||
for _ in range(n):
|
||||
self.start_new()
|
||||
|
||||
def stop_all(self) -> 'WalAcceptorFactory':
|
||||
for wa in self.instances:
|
||||
wa.stop()
|
||||
return self
|
||||
|
||||
def get_connstrs(self) -> str:
|
||||
""" Get list of wal acceptor endpoints suitable for wal_acceptors GUC """
|
||||
return ','.join(["localhost:{}".format(wa.port) for wa in self.instances])
|
||||
|
||||
|
||||
@zenfixture
|
||||
def base_dir():
|
||||
def wa_factory(zenith_binpath: str, repo_dir: str) -> Iterator[WalAcceptorFactory]:
|
||||
""" Gives WalAcceptorFactory providing wal acceptors. """
|
||||
wafactory = WalAcceptorFactory(zenith_binpath, os.path.join(repo_dir, "wal_acceptors"))
|
||||
yield wafactory
|
||||
# After the yield comes any cleanup code we need.
|
||||
print('Starting wal acceptors cleanup')
|
||||
wafactory.stop_all()
|
||||
|
||||
|
||||
@zenfixture
def base_dir():
def base_dir() -> str:
    """ find the base directory (currently this is the git root) """

    base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..'))
    print('base_dir is', base_dir)
    print('\nbase_dir is', base_dir)
    return base_dir


@zenfixture
def top_output_dir(base_dir):
def top_output_dir(base_dir: str) -> str:
    """ Compute the top-level directory for all tests. """

    env_test_output = os.environ.get('TEST_OUTPUT')
    if env_test_output is not None:
        output_dir = env_test_output
@@ -303,8 +579,9 @@ def top_output_dir(base_dir):


@zenfixture
def test_output_dir(request, top_output_dir):
def test_output_dir(request: Any, top_output_dir: str) -> str:
    """ Compute the working directory for an individual test. """

    if os.environ.get('TEST_SHARED_FIXTURES') is None:
        # one directory per test
        test_name = request.node.name
@@ -320,19 +597,22 @@ def test_output_dir(request, top_output_dir):


@zenfixture
def repo_dir(request, test_output_dir):
    """ Compute the test repo_dir
def repo_dir(request: Any, test_output_dir: str) -> str:
    """
    Compute the test repo_dir.

    "repo_dir" is the place where all of the pageserver files will go.
    It doesn't have anything to do with the git repo.
    """

    repo_dir = os.path.join(test_output_dir, 'repo')
    return repo_dir


@zenfixture
def zenith_binpath(base_dir):
    """ find the zenith binaries """
def zenith_binpath(base_dir: str) -> str:
    """ Find the zenith binaries. """

    env_zenith_bin = os.environ.get('ZENITH_BIN')
    if env_zenith_bin:
        zenith_dir = env_zenith_bin
@@ -344,8 +624,9 @@ def zenith_binpath(base_dir):


@zenfixture
def pg_distrib_dir(base_dir):
    """ find the postgress install """
def pg_distrib_dir(base_dir: str) -> str:
    """ Find the postgres install. """

    env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR')
    if env_postgres_bin:
        pg_dir = env_postgres_bin

test_runner/setup.cfg (new file)
@@ -0,0 +1,28 @@
# Just trying to gather linter settings in one file.
# I wonder if there's a way to de-duplicate them...

[flake8]
max-line-length = 100

[pycodestyle]
max-line-length = 100

[yapf]
based_on_style = pep8
column_limit = 100

[mypy]
# some tests don't typecheck when this flag is set
check_untyped_defs = false

disallow_incomplete_defs = false
disallow_untyped_calls = false
disallow_untyped_decorators = false
disallow_untyped_defs = false
strict = true

[mypy-psycopg2.*]
ignore_missing_imports = true

[mypy-pytest.*]
ignore_missing_imports = true
@@ -2,9 +2,7 @@ import pytest
import os

pytest_plugins = ("fixtures.zenith_fixtures")

"""

Use this test to see what happens when tests fail.

We should be able to clean up after ourselves, including stopping any
@@ -12,21 +10,18 @@ postgres or pageserver processes.

Set the environment variable RUN_BROKEN to see this test run (and fail,
and hopefully not leave any server processes behind).

"""

run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None,
                                reason="only used for testing the fixtures")

run_broken = pytest.mark.skipif(
    os.environ.get('RUN_BROKEN') == None,
    reason="only used for testing the fixtures"
)

@run_broken
def test_broken(zenith_cli, pageserver, postgres, pg_bin):
    # Create a branch for us
    zenith_cli.run(["branch", "test_broken", "empty"]);
    zenith_cli.run(["branch", "test_broken", "empty"])

    pg = postgres.create_start("test_broken")
    postgres.create_start("test_broken")
    print('postgres is running')

    print('THIS NEXT COMMAND WILL FAIL:')

@@ -1,11 +1,8 @@
To add a new SQL test

- add sql script to run to zenith_regress/sql/testname.sql
- add expected output to zenith/regress/expected/testname.out
- add testname to both parallel_schedule and serial_schedule files*
- add expected output to zenith_regress/expected/testname.out
- add testname to parallel_schedule

That's it.
For more complex tests see the PostgreSQL regression tests. These work basically the same way.

*it was changed recently in PostgreSQL upstream - no more separate serial_schedule.
Someday we'll catch up with these changes.

@@ -17,3 +17,29 @@ SELECT * FROM truncatetest;
|
||||
(0 rows)
|
||||
|
||||
DROP TABLE truncatetest;
|
||||
--
|
||||
-- Test that the FSM is truncated along with the table.
|
||||
--
|
||||
-- Create a test table and delete and vacuum away most of the rows.
|
||||
-- This leaves the FSM full of pages with plenty of space
|
||||
create table tt(i int);
|
||||
insert into tt select g from generate_series(1, 100000) g;
|
||||
delete from tt where i%100 != 0 and i > 10000;
|
||||
vacuum freeze tt;
|
||||
-- Delete the rest of the rows, and vacuum again. This truncates the
|
||||
-- heap to 0 blocks, and should also truncate the FSM.
|
||||
delete from tt;
|
||||
vacuum tt;
|
||||
-- This can be used to look at the FSM directly, if the 'pg_freespace' contrib module
|
||||
-- is installed
|
||||
--SELECT blkno, avail from generate_series(1, 450) blkno, pg_freespace('tt'::regclass, blkno) AS avail;
|
||||
-- Insert a row again. It should go on block #0. If the FSM was not truncated,
|
||||
-- the insertion would find a higher-numbered block in the FSM and use that instead.
|
||||
insert into tt values (0);
|
||||
select ctid, * from tt;
|
||||
ctid | i
|
||||
-------+---
|
||||
(0,1) | 0
|
||||
(1 row)
|
||||
|
||||
drop table tt;
|
||||
|
||||
@@ -1,8 +1,7 @@
# ----------
# src/test/regress/parallel_schedule
#
# By convention, we put no more than twenty tests in any one parallel group;
# this limits the number of connections needed to run the tests.
# Like in PostgreSQL src/test/regress/parallel_schedule, we put no
# more than twenty tests in any one parallel group; this limits the
# number of connections needed to run the tests.
# ----------

test: zenith-cid

@@ -1,6 +0,0 @@
# src/test/regress/serial_schedule
# This should probably be in an order similar to parallel_schedule.
test: zenith-cid
test: zenith-rel-truncate
test: zenith-clog
test: zenith-vacuum-full
@@ -16,3 +16,31 @@ VACUUM truncatetest;
|
||||
SELECT * FROM truncatetest;
|
||||
|
||||
DROP TABLE truncatetest;
|
||||
|
||||
|
||||
--
|
||||
-- Test that the FSM is truncated along with the table.
|
||||
--
|
||||
|
||||
-- Create a test table and delete and vacuum away most of the rows.
|
||||
-- This leaves the FSM full of pages with plenty of space
|
||||
create table tt(i int);
|
||||
insert into tt select g from generate_series(1, 100000) g;
|
||||
delete from tt where i%100 != 0 and i > 10000;
|
||||
vacuum freeze tt;
|
||||
|
||||
-- Delete the rest of the rows, and vacuum again. This truncates the
|
||||
-- heap to 0 blocks, and should also truncate the FSM.
|
||||
delete from tt;
|
||||
vacuum tt;
|
||||
|
||||
-- This can be used to look at the FSM directly, if the 'pg_freespace' contrib module
|
||||
-- is installed
|
||||
--SELECT blkno, avail from generate_series(1, 450) blkno, pg_freespace('tt'::regclass, blkno) AS avail;
|
||||
|
||||
-- Insert a row again. It should go on block #0. If the FSM was not truncated,
|
||||
-- the insertion would find a higher-numbered block in the FSM and use that instead.
|
||||
insert into tt values (0);
|
||||
select ctid, * from tt;
|
||||
|
||||
drop table tt;
|
||||
|
||||
Submodule vendor/postgres updated: 1ac4f5b6b1...fedd3660a2
@@ -27,7 +27,7 @@ postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
anyhow = "1.0"
crc32c = "0.6.0"
parse_duration = "2.1.1"
humantime = "2.1.0"
walkdir = "2"
serde = { version = "1.0", features = ["derive"] }
hex = "0.4.3"

@@ -12,13 +12,13 @@ safekeeper yet.
The primary connects to the WAL safekeeper, so it works in a "push"
fashion. That's different from how streaming replication usually
works, where the replica initiates the connection. To do that, there
is a component called "safekeeper_proxy". The safekeeper_proxy runs on
the same host as the primary Postgres server and connects to it to do
streaming replication. It also connects to the WAL safekeeper, and
forwards all the WAL. (PostgreSQL's archive_commands works in the
is a component called the "WAL proposer". The WAL proposer is a
background worker that runs in the primary Postgres server. It
connects to the WAL safekeeper, and
sends all the WAL. (PostgreSQL's archive_commands works in the
"push" style, but it operates on a WAL segment granularity. If
PostgreSQL had a push style API for streaming, we wouldn't need the
proxy).
PostgreSQL had a push style API for streaming, the WAL proposer could be
implemented using it.)

The Page Server connects to the WAL safekeeper, using the same
streaming replication protocol that's used between Postgres primary
@@ -33,6 +33,4 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only
one primary node can be actively streaming WAL to the quorum of
safekeepers.


See vendor/postgres/src/bin/safekeeper/README.md for a more detailed
description of the consensus protocol. (TODO: move the text here?)
See README.md for a more detailed description of the consensus protocol.

@@ -1,15 +1,15 @@
# Proxy-safekeeper communication consensus protocol.
# WAL proposer-safekeeper communication consensus protocol.

## General requirements and architecture

There is a single stateless master and several safekeepers. The number of safekeepers is determined by the redundancy level.
To minimize the number of changes in Postgres core, we are using standard streaming replication from the master (through the WAL sender).
This replication stream is initiated by `safekeeper_proxy` which receives data from the master and broadcasts it to safekeepers.
This replication stream is initiated by the WAL proposer process that runs in the PostgreSQL server, which broadcasts the WAL generated by PostgreSQL to safekeepers.
To provide durability we use synchronous replication at the master (the response to the commit statement is sent to the client
only when acknowledged by the WAL receiver). `safekeeper_proxy` sends this acknowledgment only when the LSN of the commit record is confirmed by a quorum of safekeepers.
only when acknowledged by the WAL receiver). The WAL proposer sends this acknowledgment only when the LSN of the commit record is confirmed by a quorum of safekeepers.

`Safekeeper_proxy` tries to establish connections with safekeepers.
At any moment of time each safekeeper can serve exactly one proxy, but it can accept new connections.
The WAL proposer tries to establish connections with safekeepers.
At any moment of time each safekeeper can serve exactly one proposer, but it can accept new connections.

Any of the safekeepers can be used as a WAL server, producing a replication stream. So both `Pagers` and `Replicas`
(read-only computation nodes) can connect to a safekeeper to receive the WAL stream. Safekeepers are streaming WAL until
@@ -30,13 +30,13 @@ then accepts proposed nodeId and persists this choice in the local control file.
5. If a quorum of safekeepers approve the proposed nodeId, then the server assumes that the handshake is successfully completed and switches to the recovery stage (see the sketch below).

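To make the quorum rule above concrete, here is a minimal sketch in the same spirit as the pseudocode in the Algorithm section further down. The function and field names are illustrative only, not taken from the patch:

```python
# Hypothetical illustration of the voting step described above: the proposer
# picks a nodeId newer than anything reported by the safekeepers and
# considers the handshake done once a quorum accepts it.
def propose_node_id(safekeeper_states, quorum):
    # nodeId is (term, uuid); bump the highest known term by one
    max_term = max(state["node_id"]["term"] for state in safekeeper_states)
    proposal = {"term": max_term + 1, "uuid": "proposer-uuid"}

    acks = 0
    for state in safekeeper_states:
        # a safekeeper rejects proposals older than what it already accepted
        if (proposal["term"], proposal["uuid"]) >= (state["node_id"]["term"], state["node_id"]["uuid"]):
            acks += 1

    if acks >= quorum:
        return proposal  # handshake completed, move on to recovery
    raise RuntimeError("proposal rejected, retry with a new nodeId")
```
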
## Recovery
Proxy computes max(`restartLSN`) and max(`flushLSN`) from a quorum of attached safekeepers.
Proposer computes max(`restartLSN`) and max(`flushLSN`) from a quorum of attached safekeepers.
`RestartLSN` is the position in the WAL which is known to be delivered to all safekeepers.
In other words: `restartLSN` can also be considered a cut-off horizon (all preceding WAL segments can be removed).
`FlushLSN` is the position flushed by the safekeeper to its local persistent storage.

If max(`restartLSN`) != max(`flushLSN`), then recovery has to be performed.
Proxy creates a replication channel with the most advanced safekeeper (the safekeeper with the largest `flushLSN`).
Proposer creates a replication channel with the most advanced safekeeper (the safekeeper with the largest `flushLSN`).
Then it downloads all WAL messages between max(`restartLSN`)..max(`flushLSN`).
Messages are inserted in an L1-list (ordered by LSN). Then we locate the position of each safekeeper in this list according
to their `flushLSN`s. Safekeepers that are not yet connected (out of quorum) should start from the beginning of the list
@@ -49,11 +49,11 @@ to avoid loss of committed data.
The calculated max(`flushLSN`) is called `VCL` (Volume Complete LSN). As it is chosen among a quorum, there may be some other offline safekeeper with a larger
`VCL`. Once it becomes online, we need to overwrite its WAL beyond `VCL`. To support this, each safekeeper maintains an
`epoch` number. `Epoch` plays almost the same role as `term`, but the algorithm for bumping the `epoch` is different.
`VCL` and the new epoch are received by the safekeeper from the proxy during voting.
`VCL` and the new epoch are received by the safekeeper from the proposer during voting.
But the safekeeper doesn't switch to the new epoch immediately after voting.
Instead, the safekeeper waits until a record with LSN > max(`flushLSN`,`VCL`) is received.
This means that we restore all records from the old generation and only then switch to the new generation.
When the proxy calculates max(`FlushLSN`), it first compares `Epoch`. So actually we compare (`Epoch`,`FlushLSN`) pairs.
When the proposer calculates max(`FlushLSN`), it first compares `Epoch`. So actually we compare (`Epoch`,`FlushLSN`) pairs.

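As a rough illustration of the recovery planning described above (illustrative pseudocode only, not code from this patch):

```python
# Illustrative sketch: choose VCL and the recovery donor from a quorum of
# safekeeper states, ordering them by (epoch, flushLSN) as described above.
def plan_recovery(quorum_states):
    # quorum_states: list of dicts with 'epoch', 'flush_lsn', 'restart_lsn'
    restart_lsn = max(s["restart_lsn"] for s in quorum_states)

    # the "most advanced" safekeeper wins on epoch first, then on flushLSN
    donor = max(quorum_states, key=lambda s: (s["epoch"], s["flush_lsn"]))
    vcl = donor["flush_lsn"]

    if restart_lsn == vcl:
        return None  # nothing to recover
    # otherwise stream WAL in (restart_lsn, vcl] from the donor and fan it
    # out to the lagging safekeepers
    return {"donor": donor, "from": restart_lsn, "to": vcl}
```
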
Let's look at some examples. Consider that we have three safekeepers: S1, S2, S3. Si(N) means that the i-th safekeeper has epoch=N.
Ri(x) is a WAL record for resource x with LSN=i. Assume that we have the following state:
@@ -64,7 +64,7 @@ S2(1): R1(a),R2(b)
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
```

Proxy chooses quorum (S1,S2). VCL for them is 2. We download the WAL from S2 to the proxy and schedule its write to S1.
Proposer chooses quorum (S1,S2). VCL for them is 2. We download the WAL from S2 to the proposer and schedule its write to S1.
After receiving record R5 the picture can be:

```
@@ -119,29 +119,26 @@ S3(1): R1(a),R2(b),R3(c),R4(d)
```

## Main loop
Once recovery is completed, the proxy switches to the normal processing loop: it receives the WAL stream from the master and appends WAL
Once recovery is completed, the proposer switches to the normal processing loop: it receives the WAL stream from Postgres and appends WAL
messages to the list. At the same time it tries to push messages to safekeepers. Each safekeeper is associated
with some element in the message list, and once it has acknowledged receiving the message, its position is moved forward.
Each queue element contains an acknowledgment mask whose bits correspond to safekeepers.
Once all safekeepers have acknowledged receiving this message (by setting the corresponding bit),
the element can be removed from the queue and `restartLSN` is advanced forward.

The proxy maintains `restartLSN` and `commitLSN` based on the responses received from safekeepers.
The proposer maintains `restartLSN` and `commitLSN` based on the responses received from safekeepers.
`RestartLSN` equals the LSN of the head message in the list. `CommitLSN` is the `flushLSN[nSafekeepers-Quorum]` element
in the ordered array of the safekeepers' `flushLSN`s. `CommitLSN` and `RestartLSN` are included in the requests
sent from the proxy to safekeepers and stored in the safekeepers' control files.
sent from the proposer to safekeepers and stored in the safekeepers' control files.
To avoid the overhead of extra fsyncs, this control file is not fsynced on each request. Flushing this file is performed
periodically, which means that the `restartLSN`/`commitLSN` stored by a safekeeper may be slightly stale.
It is not critical because it may only cause redundant processing of some WAL records.
`FlushLSN` is recalculated after a node restart by scanning the local WAL files.

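A small illustrative sketch of these two bookkeeping rules (not code from the patch; the names are made up):

```python
# Illustrative sketch: advance restartLSN via the per-message acknowledgment
# mask, and derive commitLSN from the sorted flushLSNs, as described above.
from collections import deque

def advance_restart_lsn(queue, n_safekeepers):
    # queue: deque of dicts {'end_lsn': int, 'ack_mask': int}
    all_acked = (1 << n_safekeepers) - 1
    restart_lsn = None
    while queue and queue[0]["ack_mask"] == all_acked:
        restart_lsn = queue.popleft()["end_lsn"]
    return restart_lsn

def commit_lsn(flush_lsns, quorum):
    # commitLSN is flushLSN[nSafekeepers - quorum] in ascending order,
    # i.e. an LSN already flushed by at least `quorum` safekeepers
    ordered = sorted(flush_lsns)
    return ordered[len(ordered) - quorum]
```
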
## Fault tolerance
Once `safekeeper_proxy` loses the connection to a safekeeper it tries to reestablish this connection using the same nodeId.
If `safekeeper_proxy` loses the connection with the master, it is terminated. Right now the safekeeper is a standalone process,
which can be launched at any node, but it can also be spawned as the master's background worker, so that it is automatically
restarted in case of a Postgres instance restart.
If the WAL proposer process loses the connection to a safekeeper it tries to reestablish this connection using the same nodeId.

Restart of `safekeeper_proxy` initiates a new round of voting and a switch to a new epoch.
Restart of PostgreSQL initiates a new round of voting and a switch to a new epoch.

## Limitations
Right now the message queue is maintained in main memory and is not spilled to disk.
@@ -161,7 +158,7 @@ It is assumed that in case of losing local data by some safekeepers, it should
## Algorithm

```python
|
||||
process SafekeeperProxy(safekeepers,server,curr_epoch,restart_lsn=0,message_queue={},feedbacks={})
|
||||
process WalProposer(safekeepers,server,curr_epoch,restart_lsn=0,message_queue={},feedbacks={})
|
||||
function do_recovery(epoch,restart_lsn,VCL)
|
||||
leader = i:safekeepers[i].state.epoch=epoch and safekeepers[i].state.flushLsn=VCL
|
||||
wal_stream = safekeepers[leader].start_replication(restart_lsn,VCL)
|
||||
@@ -241,31 +238,31 @@ end process
|
||||
|
||||
process safekeeper(gateway,state)
|
||||
function handshake()
|
||||
proxy = gateway.accept()
|
||||
server_info = proxy.read()
|
||||
proxy.write(state)
|
||||
proposal = proxy.read()
|
||||
proposer = gateway.accept()
|
||||
server_info = proposer.read()
|
||||
proposer.write(state)
|
||||
proposal = proposer.read()
|
||||
if proposal.nodeId < state.nodeId
|
||||
proxy.write(rejected)
|
||||
proposer.write(rejected)
|
||||
return null
|
||||
else
|
||||
state.nodeId = proposal.nodeId
|
||||
state.proposed_epoch = proposal.epoch
|
||||
state.VCL = proposal.VCL
|
||||
write_control_file(state)
|
||||
proxy.write(accepted)
|
||||
return proxy
|
||||
proposer.write(accepted)
|
||||
return proposer
|
||||
end function
|
||||
|
||||
state = read_control_file()
|
||||
state.flushLsn = locate_end_of_wal()
|
||||
|
||||
for ever
|
||||
proxy = handshake()
|
||||
if not proxy
|
||||
proposer = handshake()
|
||||
if not proposer
|
||||
continue
|
||||
for ever
|
||||
req = proxy.read()
|
||||
req = proposer.read()
|
||||
if req.nodeId != state.nodeId
|
||||
break
|
||||
save_wal_file(req.data)
|
||||
@@ -276,7 +273,7 @@ process safekeeper(gateway,state)
|
||||
state.flushLsn = req.endPos
|
||||
save_control_file(state)
|
||||
resp = Response(state.epoch,req.endPos)
|
||||
proxy.write(resp)
|
||||
proposer.write(resp)
|
||||
notify_wal_sender(Min(req.commitLsn,req.endPos))
|
||||
end process
|
||||
```
|
||||
|
||||
@@ -5,12 +5,10 @@ use anyhow::{Context, Result};
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
use log::*;
|
||||
use parse_duration::parse;
|
||||
use slog::Drain;
|
||||
use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{fs::File, fs::OpenOptions};
|
||||
|
||||
use walkeeper::s3_offload;
|
||||
@@ -27,13 +25,6 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Path to the WAL acceptor data directory"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("systemid")
|
||||
.long("systemid")
|
||||
.takes_value(true)
|
||||
.required(true)
|
||||
.help("PostgreSQL system id, from pg_control"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
.short("l")
|
||||
@@ -53,6 +44,12 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("interval for keeping WAL as walkeeper node, after which them will be uploaded to S3 and removed locally"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("recall")
|
||||
.long("recall")
|
||||
.takes_value(true)
|
||||
.help("Period for requestion pageserver to call for replication"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
@@ -69,17 +66,14 @@ fn main() -> Result<()> {
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let systemid_str = arg_matches.value_of("systemid").unwrap();
|
||||
let systemid: u64 = systemid_str.parse()?;
|
||||
|
||||
let mut conf = WalAcceptorConf {
|
||||
data_dir: PathBuf::from("./"),
|
||||
systemid,
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
pageserver_addr: None,
|
||||
listen_addr: "127.0.0.1:5454".parse()?,
|
||||
listen_addr: "localhost:5454".to_string(),
|
||||
ttl: None,
|
||||
recall_period: None,
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
@@ -98,15 +92,19 @@ fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("listen") {
|
||||
conf.listen_addr = addr.parse().unwrap();
|
||||
conf.listen_addr = addr.to_owned();
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("pageserver") {
|
||||
conf.pageserver_addr = Some(addr.parse().unwrap());
|
||||
conf.pageserver_addr = Some(addr.to_owned());
|
||||
}
|
||||
|
||||
if let Some(ttl) = arg_matches.value_of("ttl") {
|
||||
conf.ttl = Some::<Duration>(parse(ttl)?);
|
||||
conf.ttl = Some(humantime::parse_duration(ttl)?);
|
||||
}
|
||||
|
||||
if let Some(recall) = arg_matches.value_of("recall") {
|
||||
conf.recall_period = Some(humantime::parse_duration(recall)?);
|
||||
}
|
||||
|
||||
start_wal_acceptor(conf)
|
||||
@@ -188,12 +186,19 @@ fn init_logging(
|
||||
if conf.daemonize {
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
record.level().is_at_least(slog::Level::Info)
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build().fuse();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
record.level().is_at_least(slog::Level::Info)
|
||||
})
|
||||
.fuse();
|
||||
let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
//
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod pq_protocol;
|
||||
pub mod receive_wal;
|
||||
pub mod replication;
|
||||
pub mod s3_offload;
|
||||
@@ -11,15 +9,13 @@ pub mod send_wal;
|
||||
pub mod timeline;
|
||||
pub mod wal_service;
|
||||
|
||||
use crate::pq_protocol::SystemId;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WalAcceptorConf {
|
||||
pub data_dir: PathBuf,
|
||||
pub systemid: SystemId,
|
||||
pub daemonize: bool,
|
||||
pub no_sync: bool,
|
||||
pub listen_addr: SocketAddr,
|
||||
pub pageserver_addr: Option<SocketAddr>,
|
||||
pub listen_addr: String,
|
||||
pub pageserver_addr: Option<String>,
|
||||
pub ttl: Option<Duration>,
|
||||
pub recall_period: Option<Duration>,
|
||||
}
|
||||
|
||||
@@ -1,224 +0,0 @@
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use pageserver::ZTimelineId;
|
||||
use std::io::{self, Read};
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type SystemId = u64;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum FeMessage {
|
||||
Query(FeQueryMessage),
|
||||
Terminate,
|
||||
CopyData(FeCopyData),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RowDescriptor {
|
||||
pub typoid: Oid,
|
||||
pub typlen: i16,
|
||||
pub name: &'static [u8],
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum BeMessage<'a> {
|
||||
AuthenticationOk,
|
||||
ReadyForQuery,
|
||||
RowDescription(&'a [RowDescriptor]),
|
||||
DataRow(&'a [Option<&'a [u8]>]),
|
||||
CommandComplete(&'a [u8]),
|
||||
Negotiate,
|
||||
Copy,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FeStartupMessage {
|
||||
pub version: u32,
|
||||
pub kind: StartupRequestCode,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub appname: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum StartupRequestCode {
|
||||
Cancel,
|
||||
NegotiateSsl,
|
||||
NegotiateGss,
|
||||
Normal,
|
||||
}
|
||||
|
||||
impl FeStartupMessage {
|
||||
pub fn read_from(reader: &mut impl Read) -> io::Result<Self> {
|
||||
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
|
||||
const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678;
|
||||
const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679;
|
||||
const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680;
|
||||
|
||||
let len = reader.read_u32::<BigEndian>()? as usize;
|
||||
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"FeStartupMessage: invalid message length",
|
||||
));
|
||||
}
|
||||
|
||||
let version = reader.read_u32::<BigEndian>()?;
|
||||
|
||||
let kind = match version {
|
||||
CANCEL_REQUEST_CODE => StartupRequestCode::Cancel,
|
||||
NEGOTIATE_SSL_CODE => StartupRequestCode::NegotiateSsl,
|
||||
NEGOTIATE_GSS_CODE => StartupRequestCode::NegotiateGss,
|
||||
_ => StartupRequestCode::Normal,
|
||||
};
|
||||
|
||||
let params_len = len - 8;
|
||||
let mut params_bytes = vec![0u8; params_len];
|
||||
reader.read_exact(params_bytes.as_mut())?;
|
||||
|
||||
let params_str = str::from_utf8(¶ms_bytes).unwrap();
|
||||
let params = params_str.split('\0');
|
||||
let mut options = false;
|
||||
let mut timelineid: Option<ZTimelineId> = None;
|
||||
let mut appname: Option<String> = None;
|
||||
for p in params {
|
||||
if p == "options" {
|
||||
options = true;
|
||||
} else if options {
|
||||
for opt in p.split(' ') {
|
||||
if let Some(ztimelineid_str) = opt.strip_prefix("ztimelineid=") {
|
||||
// FIXME: rethrow parsing error, don't unwrap
|
||||
timelineid = Some(ZTimelineId::from_str(ztimelineid_str).unwrap());
|
||||
} else if let Some(val) = opt.strip_prefix("application_name=") {
|
||||
appname = Some(val.to_string());
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if timelineid.is_none() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"timelineid is required",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeStartupMessage {
|
||||
version,
|
||||
kind,
|
||||
appname,
|
||||
timelineid: timelineid.unwrap(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FeQueryMessage {
|
||||
pub body: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FeCopyData {
|
||||
pub body: Bytes,
|
||||
}
|
||||
|
||||
impl<'a> BeMessage<'a> {
|
||||
pub fn write(buf: &mut BytesMut, message: &BeMessage) {
|
||||
match message {
|
||||
BeMessage::AuthenticationOk => {
|
||||
buf.put_u8(b'R');
|
||||
buf.put_i32(4 + 4);
|
||||
buf.put_i32(0);
|
||||
}
|
||||
|
||||
BeMessage::ReadyForQuery => {
|
||||
buf.put_u8(b'Z');
|
||||
buf.put_i32(4 + 1);
|
||||
buf.put_u8(b'I');
|
||||
}
|
||||
|
||||
BeMessage::Negotiate => {
|
||||
buf.put_u8(b'N');
|
||||
}
|
||||
|
||||
BeMessage::Copy => {
|
||||
buf.put_u8(b'W');
|
||||
buf.put_i32(7);
|
||||
buf.put_u8(b'\0');
|
||||
buf.put_u8(b'\0');
|
||||
buf.put_u8(b'\0');
|
||||
}
|
||||
|
||||
BeMessage::RowDescription(rows) => {
|
||||
buf.put_u8(b'T');
|
||||
|
||||
let mut body = BytesMut::new();
|
||||
body.put_i16(rows.len() as i16); // # of fields
|
||||
for row in rows.iter() {
|
||||
body.put_slice(row.name);
|
||||
body.put_i32(0); /* table oid */
|
||||
body.put_i16(0); /* attnum */
|
||||
body.put_u32(row.typoid);
|
||||
body.put_i16(row.typlen);
|
||||
body.put_i32(-1); /* typmod */
|
||||
body.put_i16(0); /* format code */
|
||||
}
|
||||
buf.put_i32((4 + body.len()) as i32); // # of bytes, including len field itself
|
||||
buf.put(body);
|
||||
}
|
||||
|
||||
BeMessage::DataRow(vals) => {
|
||||
buf.put_u8(b'D');
|
||||
let total_len: usize = vals
|
||||
.iter()
|
||||
.fold(0, |acc, row| acc + 4 + row.map_or(0, |s| s.len()));
|
||||
buf.put_u32(4 + 2 + total_len as u32);
|
||||
buf.put_u16(vals.len() as u16);
|
||||
for val_opt in vals.iter() {
|
||||
if let Some(val) = val_opt {
|
||||
buf.put_u32(val.len() as u32);
|
||||
buf.put_slice(val);
|
||||
} else {
|
||||
buf.put_i32(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BeMessage::CommandComplete(cmd) => {
|
||||
buf.put_u8(b'C');
|
||||
buf.put_i32(4 + cmd.len() as i32);
|
||||
buf.put_slice(cmd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FeMessage {
|
||||
pub fn read_from(reader: &mut impl Read) -> io::Result<FeMessage> {
|
||||
let tag = reader.read_u8()?;
|
||||
let len = reader.read_u32::<BigEndian>()?;
|
||||
|
||||
if len < 4 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"FeMessage: invalid message length",
|
||||
));
|
||||
}
|
||||
|
||||
let body_len = (len - 4) as usize;
|
||||
let mut body = vec![0u8; body_len];
|
||||
reader.read_exact(&mut body)?;
|
||||
|
||||
match tag {
|
||||
b'Q' => Ok(FeMessage::Query(FeQueryMessage { body: body.into() })),
|
||||
b'd' => Ok(FeMessage::CopyData(FeCopyData { body: body.into() })),
|
||||
b'X' => Ok(FeMessage::Terminate),
|
||||
tag => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("unknown message tag: {},'{:?}'", tag, body),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use log::*;
|
||||
use postgres::{Client, NoTls};
|
||||
use postgres::{Client, Config, NoTls};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::{max, min};
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
@@ -12,17 +12,20 @@ use std::io::{BufReader, Read, Seek, SeekFrom, Write};
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::pq_protocol::*;
|
||||
use crate::replication::HotStandbyFeedback;
|
||||
use crate::timeline::{Timeline, TimelineTools};
|
||||
use crate::WalAcceptorConf;
|
||||
use pageserver::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils::{TimeLineID, XLogFileName, MAX_SEND_SIZE, XLOG_BLCKSZ};
|
||||
use zenith_utils::pq_proto::SystemId;
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xCafeCeefu32;
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 1;
|
||||
const SK_PROTOCOL_VERSION: u32 = 1;
|
||||
const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||
@@ -36,9 +39,9 @@ pub struct NodeId {
|
||||
uuid: [u8; 16],
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ServerInfo {
|
||||
/// proxy-safekeeper protocol version
|
||||
/// proposer-safekeeper protocol version
|
||||
pub protocol_version: u32,
|
||||
/// Postgres server version
|
||||
pub pg_version: u32,
|
||||
@@ -51,7 +54,7 @@ pub struct ServerInfo {
|
||||
pub wal_seg_size: u32,
|
||||
}
|
||||
|
||||
/// Vote request sent from proxy to safekeepers
|
||||
/// Vote request sent from proposer to safekeepers
|
||||
#[derive(Debug, PartialEq, Serialize, Deserialize)]
|
||||
struct RequestVote {
|
||||
node_id: NodeId,
|
||||
@@ -87,7 +90,7 @@ impl SafeKeeperInfo {
|
||||
format_version: SK_FORMAT_VERSION,
|
||||
epoch: 0,
|
||||
server: ServerInfo {
|
||||
protocol_version: SK_PROTOCOL_VERSION, /* proxy-safekeeper protocol version */
|
||||
protocol_version: SK_PROTOCOL_VERSION, /* proposer-safekeeper protocol version */
|
||||
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
|
||||
node_id: NodeId {
|
||||
term: 0,
|
||||
@@ -106,7 +109,7 @@ impl SafeKeeperInfo {
|
||||
}
|
||||
}
|
||||
|
||||
/// Request with WAL message sent from proxy to safekeeper.
|
||||
/// Request with WAL message sent from proposer to safekeeper.
|
||||
#[derive(Debug, PartialEq, Serialize, Deserialize)]
|
||||
struct SafeKeeperRequest {
|
||||
/// Sender's node identifier (looks like we do not need it for TCP streaming connection)
|
||||
@@ -115,13 +118,13 @@ struct SafeKeeperRequest {
|
||||
begin_lsn: Lsn,
|
||||
/// end position of message in WAL
|
||||
end_lsn: Lsn,
|
||||
/// restart LSN position (minimal LSN which may be needed by proxy to perform recovery)
|
||||
/// restart LSN position (minimal LSN which may be needed by proposer to perform recovery)
|
||||
restart_lsn: Lsn,
|
||||
/// LSN committed by quorum of safekeepers
|
||||
commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Report safekeeper state to proxy
|
||||
/// Report safekeeper state to proposer
|
||||
#[derive(Debug, PartialEq, Serialize, Deserialize)]
|
||||
struct SafeKeeperResponse {
|
||||
epoch: u64,
|
||||
@@ -142,6 +145,44 @@ pub struct ReceiveWalConn {
|
||||
pub conf: WalAcceptorConf,
|
||||
}
|
||||
|
||||
///
|
||||
/// Periodically request pageserver to call back.
|
||||
/// If pageserver already has replication channel, it will just ignore this request
|
||||
///
|
||||
fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId) {
|
||||
let ps_addr = conf.pageserver_addr.unwrap();
|
||||
let ps_connstr = format!("postgresql://no_user@{}/no_db", ps_addr);
|
||||
|
||||
// use Config parsing because SockAddr parsing doesn't allow using host names instead of IP addresses
|
||||
let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_addr);
|
||||
let me_conf: Config = me_connstr.parse().unwrap();
|
||||
let (host, port) = connection_host_port(&me_conf);
|
||||
let callme = format!(
|
||||
"callmemaybe {} host={} port={} options='-c ztimelineid={}'",
|
||||
timelineid, host, port, timelineid
|
||||
);
|
||||
loop {
|
||||
info!(
|
||||
"requesting page server to connect to us: start {} {}",
|
||||
ps_connstr, callme
|
||||
);
|
||||
match Client::connect(&ps_connstr, NoTls) {
|
||||
Ok(mut client) => {
|
||||
if let Err(e) = client.simple_query(&callme) {
|
||||
error!("Failed to send callme request to pageserver: {}", e);
|
||||
}
|
||||
}
|
||||
Err(e) => error!("Failed to connect to pageserver {}: {}", &ps_connstr, e),
|
||||
}
|
||||
|
||||
if let Some(period) = conf.recall_period {
|
||||
sleep(period);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ReceiveWalConn {
|
||||
pub fn new(socket: TcpStream, conf: WalAcceptorConf) -> Result<ReceiveWalConn> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
@@ -160,32 +201,6 @@ impl ReceiveWalConn {
|
||||
Ok(T::des_from(&mut self.stream_in)?)
|
||||
}
|
||||
|
||||
fn request_callback(&self) -> std::result::Result<(), postgres::error::Error> {
|
||||
if let Some(addr) = self.conf.pageserver_addr {
|
||||
let ps_connstr = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
addr.ip(),
|
||||
addr.port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let callme = format!(
|
||||
"callmemaybe {} host={} port={} options='-c ztimelineid={}'",
|
||||
self.timeline.get().timelineid,
|
||||
self.conf.listen_addr.ip(),
|
||||
self.conf.listen_addr.port(),
|
||||
self.timeline.get().timelineid
|
||||
);
|
||||
info!(
|
||||
"requesting page server to connect to us: start {} {}",
|
||||
ps_connstr, callme
|
||||
);
|
||||
let mut client = Client::connect(&ps_connstr, NoTls)?;
|
||||
client.simple_query(&callme)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Receive WAL from wal_proposer
|
||||
pub fn run(&mut self) -> Result<()> {
|
||||
// Receive information about server
|
||||
@@ -220,7 +235,7 @@ impl ReceiveWalConn {
|
||||
|
||||
/* Update information about server, but preserve locally stored node_id */
|
||||
let node_id = my_info.server.node_id;
|
||||
my_info.server = server_info;
|
||||
my_info.server = server_info.clone();
|
||||
my_info.server.node_id = node_id;
|
||||
|
||||
/* Calculate WAL end based on local data */
|
||||
@@ -228,14 +243,14 @@ impl ReceiveWalConn {
|
||||
my_info.flush_lsn = flush_lsn;
|
||||
my_info.server.timeline = timeline;
|
||||
|
||||
/* Report my identifier to proxy */
|
||||
/* Report my identifier to proposer */
|
||||
my_info.ser_into(&mut self.stream_out)?;
|
||||
|
||||
/* Wait for vote request */
|
||||
let prop = self.read_req::<RequestVote>()?;
|
||||
/* This is Paxos check which should ensure that only one master can perform commits */
|
||||
if prop.node_id < my_info.server.node_id {
|
||||
/* Send my node-id to inform proxy that its candidate was rejected */
/* Send my node-id to inform proposer that its candidate was rejected */
|
||||
my_info.server.node_id.ser_into(&mut self.stream_out)?;
|
||||
bail!(
|
||||
"Reject connection attempt with term {} because my term is {}",
|
||||
@@ -251,15 +266,17 @@ impl ReceiveWalConn {
|
||||
let mut flushed_restart_lsn = Lsn(0);
|
||||
let wal_seg_size = server_info.wal_seg_size as usize;
|
||||
|
||||
/* Acknowledge the proposed candidate by returning it to the proxy */
|
||||
/* Acknowledge the proposed candidate by returning it to the proposer */
|
||||
prop.node_id.ser_into(&mut self.stream_out)?;
|
||||
|
||||
// Need to establish replication channel with page server.
|
||||
// As far as replication in postgres is initiated by the receiver, we should use the callme mechanism
|
||||
if let Err(e) = self.request_callback() {
|
||||
// Do not treat it as a fatal error and continue work
|
||||
// FIXME: we should retry after a while...
|
||||
error!("Failed to send callme request to pageserver: {}", e);
|
||||
if self.conf.pageserver_addr.is_some() {
|
||||
// Need to establish replication channel with page server.
|
||||
// As far as replication in postgres is initiated by the receiver, we should use the callme mechanism
|
||||
let conf = self.conf.clone();
|
||||
let timelineid = self.timeline.get().timelineid;
|
||||
thread::spawn(move || {
|
||||
request_callback(conf, timelineid);
|
||||
});
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -303,7 +320,7 @@ impl ReceiveWalConn {
|
||||
/*
 * Epoch switch happens when a written WAL record crosses the boundary.
 * The boundary is the maximum of the last WAL position at this node (FlushLSN) and the global
 * maximum (vcl) determined by safekeeper_proxy during handshake.
 * maximum (vcl) determined by the WAL proposer during handshake.
 * Switching epoch means that the node has completed recovery and starts writing new data into the WAL.
 */
|
||||
if my_info.epoch < prop.epoch && end_pos > max(my_info.flush_lsn, prop.vcl) {
|
||||
|
||||
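For readers skimming the diff, the condition in the comment above can be restated as a small illustrative sketch (Python pseudocode, not the actual safekeeper code):

```python
# Illustrative restatement of the epoch-switch condition above: the safekeeper
# adopts the proposed epoch only once a record lands beyond both its own
# FlushLSN and the VCL agreed during the handshake.
def maybe_switch_epoch(my_epoch, my_flush_lsn, proposed_epoch, vcl, record_end_lsn):
    boundary = max(my_flush_lsn, vcl)
    if my_epoch < proposed_epoch and record_end_lsn > boundary:
        return proposed_epoch  # recovery is complete; start the new generation
    return my_epoch            # keep the old epoch for now
```
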
@@ -1,29 +1,28 @@
|
||||
//! This module implements the replication protocol, starting with the
|
||||
//! "START REPLICATION" message.
|
||||
//! This module implements the streaming side of replication protocol, starting
|
||||
//! with the "START REPLICATION" message.
|
||||
|
||||
use crate::pq_protocol::{BeMessage, FeMessage};
|
||||
use crate::send_wal::SendWalConn;
|
||||
use crate::send_wal::SendWalHandler;
|
||||
use crate::timeline::{Timeline, TimelineTools};
|
||||
use crate::WalAcceptorConf;
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use anyhow::{anyhow, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, XLogFileName, MAX_SEND_SIZE};
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::min;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
|
||||
use std::io::{BufReader, Read, Seek, SeekFrom};
|
||||
use std::net::TcpStream;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::thread::sleep;
|
||||
use std::time::Duration;
|
||||
use std::{str, thread};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::pq_proto::{BeMessage, FeMessage, XLogDataBody};
|
||||
|
||||
const XLOG_HDR_SIZE: usize = 1 + 8 * 3; /* 'w' + startPos + walEnd + timestamp */
|
||||
const LIBPQ_HDR_SIZE: usize = 5; /* 1 byte with message type + 4 bytes length */
|
||||
const LIBPQ_MSG_SIZE_OFFS: usize = 1;
|
||||
pub const END_REPLICATION_MARKER: Lsn = Lsn::MAX;
|
||||
|
||||
type FullTransactionId = u64;
|
||||
@@ -38,29 +37,16 @@ pub struct HotStandbyFeedback {
|
||||
|
||||
/// A network connection that's speaking the replication protocol.
|
||||
pub struct ReplicationConn {
|
||||
timeline: Option<Arc<Timeline>>,
|
||||
/// Postgres connection, buffered input
|
||||
///
|
||||
/// This is an `Option` because we will spawn a background thread that will
|
||||
/// `take` it from us.
|
||||
stream_in: Option<BufReader<TcpStream>>,
|
||||
/// Postgres connection, output
|
||||
stream_out: TcpStream,
|
||||
/// wal acceptor configuration
|
||||
conf: WalAcceptorConf,
|
||||
/// assigned application name
|
||||
appname: Option<String>,
|
||||
}
|
||||
|
||||
impl ReplicationConn {
|
||||
/// Create a new `SendWal`, consuming the `Connection`.
|
||||
pub fn new(conn: SendWalConn) -> Self {
|
||||
/// Create a new `ReplicationConn`
|
||||
pub fn new(pgb: &mut PostgresBackend) -> Self {
|
||||
Self {
|
||||
timeline: conn.timeline,
|
||||
stream_in: Some(conn.stream_in),
|
||||
stream_out: conn.stream_out,
|
||||
conf: conn.conf,
|
||||
appname: None,
|
||||
stream_in: pgb.take_stream_in(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,9 +58,9 @@ impl ReplicationConn {
|
||||
// Wait for replica's feedback.
|
||||
// We only handle `CopyData` messages. Anything else is ignored.
|
||||
loop {
|
||||
match FeMessage::read_from(&mut stream_in)? {
|
||||
FeMessage::CopyData(m) => {
|
||||
let feedback = HotStandbyFeedback::des(&m.body)?;
|
||||
match FeMessage::read(&mut stream_in)? {
|
||||
Some(FeMessage::CopyData(m)) => {
|
||||
let feedback = HotStandbyFeedback::des(&m)?;
|
||||
timeline.add_hs_feedback(feedback)
|
||||
}
|
||||
msg => {
|
||||
@@ -87,7 +73,7 @@ impl ReplicationConn {
|
||||
/// Helper function that parses a pair of LSNs.
|
||||
fn parse_start_stop(cmd: &[u8]) -> Result<(Lsn, Lsn)> {
|
||||
let re = Regex::new(r"([[:xdigit:]]+/[[:xdigit:]]+)").unwrap();
|
||||
let caps = re.captures_iter(str::from_utf8(&cmd[..])?);
|
||||
let caps = re.captures_iter(str::from_utf8(cmd)?);
|
||||
let mut lsns = caps.map(|cap| cap[1].parse::<Lsn>());
|
||||
let start_pos = lsns
|
||||
.next()
|
||||
@@ -107,10 +93,10 @@ impl ReplicationConn {
|
||||
|
||||
// If that failed, try it without the .partial extension.
|
||||
match File::open(&wal_file_path) {
|
||||
Ok(opened_file) => return Ok(opened_file),
|
||||
Ok(opened_file) => Ok(opened_file),
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e.into());
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -118,9 +104,14 @@ impl ReplicationConn {
|
||||
///
|
||||
/// Handle START_REPLICATION replication command
|
||||
///
|
||||
pub fn run(&mut self, cmd: &Bytes) -> Result<()> {
|
||||
pub fn run(
|
||||
&mut self,
|
||||
swh: &mut SendWalHandler,
|
||||
pgb: &mut PostgresBackend,
|
||||
cmd: &Bytes,
|
||||
) -> Result<()> {
|
||||
// spawn the background thread which receives HotStandbyFeedback messages.
|
||||
let bg_timeline = Arc::clone(self.timeline.get());
|
||||
let bg_timeline = Arc::clone(swh.timeline.get());
|
||||
let bg_stream_in = self.stream_in.take().unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
@@ -131,23 +122,27 @@ impl ReplicationConn {
|
||||
|
||||
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(&cmd)?;
|
||||
|
||||
let wal_seg_size = self.timeline.get().get_info().server.wal_seg_size as usize;
|
||||
if wal_seg_size == 0 {
|
||||
bail!("Can not start replication before connecting to wal_proposer");
|
||||
let mut wal_seg_size: usize;
|
||||
loop {
|
||||
wal_seg_size = swh.timeline.get().get_info().server.wal_seg_size as usize;
|
||||
if wal_seg_size == 0 {
|
||||
error!("Can not start replication before connecting to wal_proposer");
|
||||
sleep(Duration::from_secs(1));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let (wal_end, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, false);
|
||||
let (wal_end, timeline) = swh.timeline.find_end_of_wal(&swh.conf.data_dir, false);
|
||||
if start_pos == Lsn(0) {
|
||||
start_pos = wal_end;
|
||||
}
|
||||
if stop_pos == Lsn(0) && self.appname == Some("wal_proposer_recovery".to_string()) {
|
||||
if stop_pos == Lsn(0) && swh.appname == Some("wal_proposer_recovery".to_string()) {
|
||||
stop_pos = wal_end;
|
||||
}
|
||||
info!("Start replication from {} till {}", start_pos, stop_pos);
|
||||
|
||||
let mut outbuf = BytesMut::new();
|
||||
BeMessage::write(&mut outbuf, &BeMessage::Copy);
|
||||
self.send(&outbuf)?;
|
||||
outbuf.clear();
|
||||
// switch to copy
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
|
||||
let mut end_pos: Lsn;
|
||||
let mut wal_file: Option<File> = None;
|
||||
@@ -163,7 +158,7 @@ impl ReplicationConn {
|
||||
end_pos = stop_pos;
|
||||
} else {
|
||||
/* normal mode */
|
||||
let timeline = self.timeline.get();
|
||||
let timeline = swh.timeline.get();
|
||||
end_pos = timeline.wait_for_lsn(start_pos);
|
||||
}
|
||||
if end_pos == END_REPLICATION_MARKER {
|
||||
@@ -177,8 +172,8 @@ impl ReplicationConn {
|
||||
// Open a new file.
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
|
||||
let timeline_id = self.timeline.get().timelineid.to_string();
|
||||
let wal_file_path = self.conf.data_dir.join(timeline_id).join(wal_file_name);
|
||||
let timeline_id = swh.timeline.get().timelineid.to_string();
|
||||
let wal_file_path = swh.conf.data_dir.join(timeline_id).join(wal_file_name);
|
||||
Self::open_wal_file(&wal_file_path)?
|
||||
}
|
||||
};
|
||||
@@ -191,32 +186,19 @@ impl ReplicationConn {
|
||||
let send_size = min(send_size, wal_seg_size - xlogoff);
|
||||
let send_size = min(send_size, MAX_SEND_SIZE);
|
||||
|
||||
let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size;
|
||||
|
||||
// Read some data from the file.
|
||||
let mut file_buf = vec![0u8; send_size];
|
||||
file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
file.read_exact(&mut file_buf)?;
|
||||
|
||||
// Write some data to the network socket.
|
||||
// FIXME: turn these into structs.
|
||||
// 'd' is CopyData;
|
||||
// 'w' is "WAL records"
|
||||
// https://www.postgresql.org/docs/9.1/protocol-message-formats.html
|
||||
// src/backend/replication/walreceiver.c
|
||||
outbuf.clear();
|
||||
outbuf.put_u8(b'd');
|
||||
outbuf.put_u32((msg_size - LIBPQ_MSG_SIZE_OFFS) as u32);
|
||||
outbuf.put_u8(b'w');
|
||||
outbuf.put_u64(start_pos.0);
|
||||
outbuf.put_u64(end_pos.0);
|
||||
outbuf.put_u64(get_current_timestamp());
|
||||
pgb.write_message(&BeMessage::XLogData(XLogDataBody {
|
||||
wal_start: start_pos.0,
|
||||
wal_end: end_pos.0,
|
||||
timestamp: get_current_timestamp(),
|
||||
data: &file_buf,
|
||||
}))?;
|
||||
|
||||
assert!(outbuf.len() + file_buf.len() == msg_size);
|
||||
// This thread has exclusive access to the TcpStream, so it's fine
|
||||
// to do this as two separate calls.
|
||||
self.send(&outbuf)?;
|
||||
self.send(&file_buf)?;
|
||||
start_pos += send_size as u64;
|
||||
|
||||
debug!("Sent WAL to page server up to {}", end_pos);
|
||||
@@ -229,10 +211,4 @@ impl ReplicationConn {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Send messages on the network.
|
||||
fn send(&mut self, buf: &[u8]) -> Result<()> {
|
||||
self.stream_out.write_all(buf.as_ref())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,8 +12,7 @@ use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::iter::FromIterator;
|
||||
use std::path::PathBuf;
|
||||
use std::path::Path;
|
||||
use std::time::SystemTime;
|
||||
use tokio::runtime;
|
||||
use tokio::time::sleep;
|
||||
@@ -42,7 +41,7 @@ pub fn thread_main(conf: WalAcceptorConf) {
|
||||
async fn offload_files(
|
||||
bucket: &Bucket,
|
||||
listing: &HashSet<String>,
|
||||
dir_path: &PathBuf,
|
||||
dir_path: &Path,
|
||||
conf: &WalAcceptorConf,
|
||||
) -> Result<u64> {
|
||||
let horizon = SystemTime::now() - conf.ttl.unwrap();
|
||||
@@ -93,11 +92,10 @@ async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
|
||||
let results = bucket
|
||||
.list("walarchive/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
let listing = HashSet::from_iter(
|
||||
results
|
||||
.iter()
|
||||
.flat_map(|b| b.contents.iter().map(|o| o.key.clone())),
|
||||
);
|
||||
let listing = results
|
||||
.iter()
|
||||
.flat_map(|b| b.contents.iter().map(|o| o.key.clone()))
|
||||
.collect();
|
||||
|
||||
let n = offload_files(&bucket, &listing, &conf.data_dir, conf).await?;
|
||||
info!("Offload {} files to S3", n);
|
||||
|
||||