mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-04 11:10:37 +00:00
Compare commits
238 Commits
set_hints_
...
buffered_r
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3e08ad485a | ||
|
|
5ad82418a9 | ||
|
|
92562145c0 | ||
|
|
915001c67e | ||
|
|
13f9565ff8 | ||
|
|
f73d043a8b | ||
|
|
8fda7a6183 | ||
|
|
4acd292717 | ||
|
|
b365a075f4 | ||
|
|
6311135d73 | ||
|
|
ee29446edc | ||
|
|
d2e5e0e728 | ||
|
|
3b471494ff | ||
|
|
9947de4a2a | ||
|
|
a3e94e888a | ||
|
|
e6f33a5cd0 | ||
|
|
2dd35b1fbe | ||
|
|
ce779cc754 | ||
|
|
497258c6fe | ||
|
|
0b6008012d | ||
|
|
d35fc20181 | ||
|
|
e42c884c2b | ||
|
|
eb706bc9f4 | ||
|
|
798df756de | ||
|
|
732d13fe06 | ||
|
|
feae7f39c1 | ||
|
|
c2b468c958 | ||
|
|
e272a380b4 | ||
|
|
0dc7a3fc15 | ||
|
|
a1bc0ada59 | ||
|
|
e9b5224a8a | ||
|
|
bdd039a9ee | ||
|
|
b405eef324 | ||
|
|
ba557d126b | ||
|
|
2dde20a227 | ||
|
|
4ade0bb41c | ||
|
|
100da024b6 | ||
|
|
de744a44dd | ||
|
|
0e026371ec | ||
|
|
4b87acb1f6 | ||
|
|
43957f4401 | ||
|
|
8a4f092e82 | ||
|
|
6b6b3f68be | ||
|
|
96f1175a80 | ||
|
|
1c29de81de | ||
|
|
f658263543 | ||
|
|
64ca947722 | ||
|
|
23f4c0a742 | ||
|
|
7c5b99683c | ||
|
|
160c4aff61 | ||
|
|
6e5ca5dc5c | ||
|
|
f3445949d1 | ||
|
|
95a85312f5 | ||
|
|
934fb8592f | ||
|
|
bb239b4f69 | ||
|
|
1cd7900790 | ||
|
|
8c61c3e54e | ||
|
|
d7c9dd06f4 | ||
|
|
b9119f11bf | ||
|
|
7216f22609 | ||
|
|
bf58f7f649 | ||
|
|
3f0ebc6a40 | ||
|
|
0baf4bc796 | ||
|
|
c356030660 | ||
|
|
c4bb6d78d4 | ||
|
|
3b82e806f2 | ||
|
|
403d9779d9 | ||
|
|
b3b8f18f61 | ||
|
|
960c7d69a8 | ||
|
|
60dae0b4ac | ||
|
|
c660926a06 | ||
|
|
7fa04e2d14 | ||
|
|
db4059cd6d | ||
|
|
fdb19fdb92 | ||
|
|
53b4dc944d | ||
|
|
a03e1b3895 | ||
|
|
15f1bcc9c2 | ||
|
|
24580f2493 | ||
|
|
e3945d94fd | ||
|
|
d806c3a47e | ||
|
|
05fe39088b | ||
|
|
530d3eaf09 | ||
|
|
7e190d72a5 | ||
|
|
9c936034b6 | ||
|
|
5719f13cb2 | ||
|
|
d134a9856e | ||
|
|
664b99b5ac | ||
|
|
4256231eb7 | ||
|
|
ae27490281 | ||
|
|
fbd8ca2ff4 | ||
|
|
ec673a5d67 | ||
|
|
7fab38c51e | ||
|
|
84f7dcd052 | ||
|
|
7095a5d551 | ||
|
|
538c2a2a3e | ||
|
|
62f83869f1 | ||
|
|
69670b61c4 | ||
|
|
0a8aaa2c24 | ||
|
|
e474790400 | ||
|
|
2c99e2461a | ||
|
|
cf8e27a554 | ||
|
|
287ea2e5e3 | ||
|
|
86e14f2f1a | ||
|
|
adbae62281 | ||
|
|
3127a4a13b | ||
|
|
6d993410c9 | ||
|
|
fb05e4cb0b | ||
|
|
b0a7234759 | ||
|
|
ddf4b15ebc | ||
|
|
3065532f15 | ||
|
|
d6fc74a412 | ||
|
|
7a370394a7 | ||
|
|
0f3cf8ac94 | ||
|
|
014be8b230 | ||
|
|
08978458be | ||
|
|
2252d9faa8 | ||
|
|
22e15844ae | ||
|
|
ca9af37478 | ||
|
|
aae41e8661 | ||
|
|
8331ce865c | ||
|
|
3bac4d485d | ||
|
|
f84eaf4f05 | ||
|
|
70b08923ed | ||
|
|
c846a824de | ||
|
|
b71e3a40e2 | ||
|
|
41dfc117e7 | ||
|
|
a72707b8cb | ||
|
|
0f770967b4 | ||
|
|
bd9f4794d9 | ||
|
|
ff5cbe2694 | ||
|
|
2319e0ec8f | ||
|
|
d4e037f1e7 | ||
|
|
139936197a | ||
|
|
d4eed61f57 | ||
|
|
7db3a9e7d9 | ||
|
|
c81ee3bd5b | ||
|
|
7fb7f67bb4 | ||
|
|
86164c8b33 | ||
|
|
97c4cd4434 | ||
|
|
a4fc6da57b | ||
|
|
c934e724a8 | ||
|
|
e554f9514f | ||
|
|
d7cff8fbaf | ||
|
|
90ef661673 | ||
|
|
579b5ee944 | ||
|
|
8ebf2fe550 | ||
|
|
16d3dc821a | ||
|
|
a91eeb1c65 | ||
|
|
49c8c03465 | ||
|
|
5344ffc3de | ||
|
|
296586b7ce | ||
|
|
b7aac87ec1 | ||
|
|
ea4c3639e3 | ||
|
|
745627c8ca | ||
|
|
c2af6d98db | ||
|
|
540973eac4 | ||
|
|
ad5f16f724 | ||
|
|
1aa7218fd6 | ||
|
|
1d5abf1253 | ||
|
|
7b3fb760fa | ||
|
|
3743344e64 | ||
|
|
bbe4f39790 | ||
|
|
7dda9f2894 | ||
|
|
8de41f1d70 | ||
|
|
6984d33b4e | ||
|
|
98d4f9cea5 | ||
|
|
87bc18972f | ||
|
|
25b7d424ab | ||
|
|
a5bd306db9 | ||
|
|
0cbee4a416 | ||
|
|
91ff09151d | ||
|
|
fea5954b18 | ||
|
|
b11b0bb088 | ||
|
|
0ede933719 | ||
|
|
3ab60ce76f | ||
|
|
01ef2baef0 | ||
|
|
6a2e4bfdd9 | ||
|
|
9563336d9a | ||
|
|
4ebe643d0c | ||
|
|
dc897fb864 | ||
|
|
a2498f3e67 | ||
|
|
d150f3ce8c | ||
|
|
cff4572774 | ||
|
|
84008a2560 | ||
|
|
6b7f3bc78c | ||
|
|
a68c23448a | ||
|
|
9043f45489 | ||
|
|
6afd99c73f | ||
|
|
18b5165b22 | ||
|
|
6dc66eefb6 | ||
|
|
0aec60938a | ||
|
|
7c62a57e54 | ||
|
|
59e7ca585d | ||
|
|
3dea06b825 | ||
|
|
ab33614ab1 | ||
|
|
03dff207db | ||
|
|
6a8785379a | ||
|
|
507177b42e | ||
|
|
b79754d06e | ||
|
|
674807eee1 | ||
|
|
30c0343727 | ||
|
|
4fae115dc2 | ||
|
|
3d17255400 | ||
|
|
5488ce8834 | ||
|
|
d7313bb85c | ||
|
|
4b73ada26e | ||
|
|
b4ecae33e4 | ||
|
|
1b9e49eb60 | ||
|
|
7a03e32dd5 | ||
|
|
018a606987 | ||
|
|
26782851a9 | ||
|
|
04ee1d5977 | ||
|
|
6245702c7c | ||
|
|
9098f2159d | ||
|
|
292bdaa6a7 | ||
|
|
6f0c065743 | ||
|
|
94c50e3e90 | ||
|
|
f83108002b | ||
|
|
511873aaed | ||
|
|
eb3fd7a8da | ||
|
|
a3214e982d | ||
|
|
1e172230ce | ||
|
|
51d36b9930 | ||
|
|
d1f0b1eda4 | ||
|
|
ed4eed0a19 | ||
|
|
2cf3a70be5 | ||
|
|
6d42ea47bf | ||
|
|
b227c63edf | ||
|
|
45c09c1cdd | ||
|
|
66dcaa4e01 | ||
|
|
a7de53d4c4 | ||
|
|
fabf5ec664 | ||
|
|
c6678c5dea | ||
|
|
1686715ad0 | ||
|
|
7507f4b309 | ||
|
|
bc709561b6 | ||
|
|
0e4cbe0165 | ||
|
|
66929ad6fb |
@@ -7,7 +7,7 @@ executors:
|
||||
zenith-build-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
- image: cimg/rust:1.52.1
|
||||
- image: cimg/rust:1.55.0
|
||||
|
||||
jobs:
|
||||
check-codestyle:
|
||||
@@ -24,6 +24,12 @@ jobs:
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: zenith-build-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
enum: ["debug", "release"]
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
# Checkout the git repo (circleci doesn't have a flag to enable submodules here)
|
||||
- checkout
|
||||
@@ -39,7 +45,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
# FIXME We could cache our own docker container, instead of installing packages every time.
|
||||
- run:
|
||||
@@ -59,12 +65,12 @@ jobs:
|
||||
if [ ! -e tmp_install/bin/postgres ]; then
|
||||
# "depth 1" saves some time by not cloning the whole repo
|
||||
git submodule update --init --depth 1
|
||||
make postgres
|
||||
make postgres -j8
|
||||
fi
|
||||
|
||||
- save_cache:
|
||||
name: Save postgres cache
|
||||
key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
@@ -96,7 +102,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
@@ -104,7 +110,7 @@ jobs:
|
||||
# Require an exact match. While an out of date cache might speed up the build,
|
||||
# there's no way to clean out old packages, so the cache grows every time something
|
||||
# changes.
|
||||
- v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
- v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
|
||||
# Build the rust code, including test binaries
|
||||
- run:
|
||||
@@ -122,12 +128,20 @@ jobs:
|
||||
|
||||
- save_cache:
|
||||
name: Save rust cache
|
||||
key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
paths:
|
||||
- ~/.cargo/registry
|
||||
- ~/.cargo/git
|
||||
- target
|
||||
|
||||
# Run style checks
|
||||
# has to run separately from cargo fmt section
|
||||
# since needs to run with dependencies
|
||||
- run:
|
||||
name: clippy
|
||||
command: |
|
||||
./run_clippy.sh
|
||||
|
||||
# Run rust unit tests
|
||||
- run: cargo test
|
||||
|
||||
@@ -168,6 +182,21 @@ jobs:
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
check-python:
|
||||
executor: python/default
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install pipenv & deps
|
||||
working_directory: test_runner
|
||||
command: |
|
||||
pip install pipenv
|
||||
pipenv install --dev
|
||||
- run:
|
||||
name: Run yapf to ensure code format
|
||||
working_directory: test_runner
|
||||
command: pipenv run yapf --recursive --diff .
|
||||
|
||||
run-pytest:
|
||||
#description: "Run pytest"
|
||||
executor: python/default
|
||||
@@ -193,6 +222,9 @@ jobs:
|
||||
needs_postgres_source:
|
||||
type: boolean
|
||||
default: false
|
||||
run_in_parallel:
|
||||
type: boolean
|
||||
default: true
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
@@ -221,16 +253,20 @@ jobs:
|
||||
echo "test_selection must be set"
|
||||
exit 1
|
||||
fi
|
||||
if << parameters.run_in_parallel >>; then
|
||||
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||
fi;
|
||||
# Run the tests.
|
||||
#
|
||||
# The junit.xml file allows CircleCI to display more fine-grained test information
|
||||
# in its "Tests" tab in the results page.
|
||||
# -s prevents pytest from capturing output, which helps to see
|
||||
# what's going on if the test hangs
|
||||
# --verbose prints name of each test (helpful when there are
|
||||
# multiple tests in one file)
|
||||
# -rA prints summary in the end
|
||||
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
# -n4 uses four processes to run tests via pytest-xdist
|
||||
# -s is not used to prevent pytest from capturing output, because tests are running
|
||||
# in parallel and logs are mixed between different tests
|
||||
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
- run:
|
||||
# CircleCI artifacts are preserved one file at a time, so skipping
|
||||
# this step isn't a good idea. If you want to extract the
|
||||
@@ -239,7 +275,7 @@ jobs:
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" -delete
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
@@ -264,20 +300,69 @@ jobs:
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
|
||||
|
||||
# Trigger a new remote CI job
|
||||
remote-ci-trigger:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
parameters:
|
||||
remote_repo:
|
||||
type: string
|
||||
environment:
|
||||
REMOTE_REPO: << parameters.remote_repo >>
|
||||
steps:
|
||||
- run:
|
||||
name: Set PR's status to pending
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "$CI_ACCESS_TOKEN" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"zenith-remote-ci\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
- run:
|
||||
name: Request a remote CI test
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "$CI_ACCESS_TOKEN" \
|
||||
--data \
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"zenith-remote-ci\",
|
||||
\"commit_hash\": \"$CIRCLE_SHA1\",
|
||||
\"remote_repo\": \"$LOCAL_REPO\"
|
||||
}
|
||||
}"
|
||||
|
||||
workflows:
|
||||
build_and_test:
|
||||
jobs:
|
||||
- check-codestyle
|
||||
- build-postgres
|
||||
- check-python
|
||||
- build-postgres:
|
||||
name: build-postgres-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
requires:
|
||||
- build-postgres
|
||||
- build-postgres-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: pg_regress tests << matrix.build_type >>
|
||||
name: pg_regress-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
@@ -286,7 +371,7 @@ workflows:
|
||||
requires:
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: other tests << matrix.build_type >>
|
||||
name: other-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
@@ -297,6 +382,7 @@ workflows:
|
||||
name: benchmarks
|
||||
build_type: release
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
requires:
|
||||
- build-zenith-release
|
||||
- docker-image:
|
||||
@@ -308,5 +394,16 @@ workflows:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- pg_regress tests release
|
||||
- other tests release
|
||||
- pg_regress-tests-release
|
||||
- other-tests-release
|
||||
- remote-ci-trigger:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
remote_repo: "zenithdb/console"
|
||||
requires:
|
||||
# XXX: Successful build doesn't mean everything is OK, but
|
||||
# the job to be triggered takes so much time to complete (~22 min)
|
||||
# that it's better not to wait for the commented-out steps
|
||||
- build-zenith-debug
|
||||
# - pg_regress-tests-release
|
||||
# - other-tests-release
|
||||
|
||||
@@ -2,12 +2,17 @@
|
||||
**/__pycache__
|
||||
**/.pytest_cache
|
||||
|
||||
/target
|
||||
/tmp_check
|
||||
/tmp_install
|
||||
/tmp_check_cli
|
||||
/test_output
|
||||
/.vscode
|
||||
/.zenith
|
||||
/integration_tests/.zenith
|
||||
/Dockerfile
|
||||
.git
|
||||
target
|
||||
tmp_check
|
||||
tmp_install
|
||||
tmp_check_cli
|
||||
test_output
|
||||
.vscode
|
||||
.zenith
|
||||
integration_tests/.zenith
|
||||
.mypy_cache
|
||||
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
|
||||
|
||||
399
Cargo.lock
generated
399
Cargo.lock
generated
@@ -26,18 +26,21 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.42"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "595d3cfa7a60d4555cb5067b99f07142a08ea778de5cf993f7b75c7d8fabc486"
|
||||
|
||||
[[package]]
|
||||
name = "arc-swap"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e906254e445520903e7fc9da4f709886c84ae4bc4ddaf0e093188d66df4dc820"
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.50"
|
||||
@@ -298,7 +301,7 @@ version = "2.33.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
"ansi_term 0.11.0",
|
||||
"atty",
|
||||
"bitflags",
|
||||
"strsim",
|
||||
@@ -307,6 +310,26 @@ dependencies = [
|
||||
"vec_map",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_format"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4556f63e28a78fa5e6f310cfea5647a25636def49a338ab69e33b34a3382057b"
|
||||
dependencies = [
|
||||
"const_format_proc_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_format_proc_macros"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "552782506c398da94466b364973b563887e0ca078bf33a76d4163736165e3594"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "control_plane"
|
||||
version = "0.1.0"
|
||||
@@ -325,6 +348,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"toml",
|
||||
"url",
|
||||
"walkeeper",
|
||||
@@ -366,26 +390,6 @@ dependencies = [
|
||||
"rustc_version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crypto-mac"
|
||||
version = "0.10.0"
|
||||
@@ -424,16 +428,6 @@ dependencies = [
|
||||
"dirs-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-next"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"dirs-sys-next",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys"
|
||||
version = "0.3.6"
|
||||
@@ -445,17 +439,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys-next"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"redox_users",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dlv-list"
|
||||
version = "0.2.3"
|
||||
@@ -935,6 +918,24 @@ dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lz4_flex"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "177c079243f6867429aca5af5053747f57e329d44f0c58bebca078cd14873ec2"
|
||||
dependencies = [
|
||||
"twox-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matches"
|
||||
version = "0.1.8"
|
||||
@@ -1172,38 +1173,42 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bookfile",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"daemonize",
|
||||
"futures",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"lz4_flex",
|
||||
"postgres",
|
||||
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||
"postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"postgres_ffi",
|
||||
"rand",
|
||||
"regex",
|
||||
"routerify",
|
||||
"rust-s3",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"slog",
|
||||
"slog-scope",
|
||||
"slog-stdlog",
|
||||
"slog-term",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"toml",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
"yakv",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
]
|
||||
@@ -1301,27 +1306,9 @@ dependencies = [
|
||||
"fallible-iterator",
|
||||
"futures",
|
||||
"log",
|
||||
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||
"postgres-protocol",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff3e0f70d32e20923cabf2df02913be7c1842d4c772db8065c00fcfdd1d1bff3"
|
||||
dependencies = [
|
||||
"base64 0.13.0",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"hmac",
|
||||
"md-5",
|
||||
"memchr",
|
||||
"rand",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
"tokio-postgres",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1342,17 +1329,6 @@ dependencies = [
|
||||
"stringprep",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "430f4131e1b7657b0cd9a2b0c3408d77c9a43a042d300b8c77f981dffcc43a2f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.1"
|
||||
@@ -1360,7 +1336,7 @@ source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||
"postgres-protocol",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1423,16 +1399,9 @@ dependencies = [
|
||||
"lazy_static",
|
||||
"memchr",
|
||||
"parking_lot",
|
||||
"protobuf",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf"
|
||||
version = "2.24.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db50e77ae196458ccd3dc58a31ea1a90b0698ab1b7928d89f644c25d72070267"
|
||||
|
||||
[[package]]
|
||||
name = "proxy"
|
||||
version = "0.1.0"
|
||||
@@ -1443,11 +1412,12 @@ dependencies = [
|
||||
"hex",
|
||||
"md5",
|
||||
"rand",
|
||||
"reqwest",
|
||||
"rustls",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.2",
|
||||
"tokio-postgres",
|
||||
"zenith_utils",
|
||||
]
|
||||
|
||||
@@ -1545,6 +1515,15 @@ dependencies = [
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||
dependencies = [
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.25"
|
||||
@@ -1703,12 +1682,6 @@ dependencies = [
|
||||
"webpki",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.5"
|
||||
@@ -1866,12 +1839,32 @@ dependencies = [
|
||||
"opaque-debug",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42a568c8f2cd051a4d283bd6eb0343ac214c1b0f1ac19f93e1175b2dee38c73d"
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook"
|
||||
version = "0.3.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c98891d737e271a2954825ef19e46bd16bdb98e2746f2eec4f7a4ef7946efd1"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"signal-hook-registry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.4.0"
|
||||
@@ -1904,59 +1897,6 @@ version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527"
|
||||
|
||||
[[package]]
|
||||
name = "slog"
|
||||
version = "2.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06"
|
||||
|
||||
[[package]]
|
||||
name = "slog-async"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c60813879f820c85dbc4eabf3269befe374591289019775898d56a81a804fbdc"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"slog",
|
||||
"take_mut",
|
||||
"thread_local",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog-scope"
|
||||
version = "4.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f95a4b4c3274cd2869549da82b57ccc930859bdbf5bcea0424bc5f140b3c786"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"lazy_static",
|
||||
"slog",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog-stdlog"
|
||||
version = "4.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8228ab7302adbf4fcb37e66f3cda78003feb521e7fd9e3847ec117a7784d0f5a"
|
||||
dependencies = [
|
||||
"log",
|
||||
"slog",
|
||||
"slog-scope",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog-term"
|
||||
version = "2.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95c1e7e5aab61ced6006149ea772770b84a0d16ce0f7885def313e4829946d76"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"chrono",
|
||||
"slog",
|
||||
"term",
|
||||
"thread_local",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.6.1"
|
||||
@@ -1979,6 +1919,12 @@ version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
||||
|
||||
[[package]]
|
||||
name = "static_assertions"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
||||
|
||||
[[package]]
|
||||
name = "stringprep"
|
||||
version = "0.1.2"
|
||||
@@ -2012,12 +1958,6 @@ dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "take_mut"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
|
||||
|
||||
[[package]]
|
||||
name = "tap"
|
||||
version = "1.0.1"
|
||||
@@ -2049,17 +1989,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "term"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
|
||||
dependencies = [
|
||||
"dirs-next",
|
||||
"rustversion",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.1.2"
|
||||
@@ -2135,9 +2064,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.8.1"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "98c8b05dc14c75ea83d63dd391100353789f5f24b8b3866542a5e85c8be8e985"
|
||||
checksum = "b4efe6fc2395938c8155973d7be49fe8d03a843726e285e100a8a383cc0154ce"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"bytes",
|
||||
@@ -2146,7 +2075,6 @@ dependencies = [
|
||||
"mio",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"parking_lot",
|
||||
"pin-project-lite",
|
||||
"signal-hook-registry",
|
||||
"tokio-macros",
|
||||
@@ -2189,31 +2117,8 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||
"postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d2b1383c7e4fb9a09e292c7c6afb7da54418d53b045f1c1fac7a911411a2b8b"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"futures",
|
||||
"log",
|
||||
"parking_lot",
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"postgres-types 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
@@ -2261,30 +2166,95 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.26"
|
||||
version = "0.1.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
|
||||
checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
|
||||
checksum = "f4f480b8f81512e825f337ad51e94c1eb5d3bbdf2b363dcd01e2b19a9ffe3f8e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-serde"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.2.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e0d2eaa99c3c2e41547cfa109e910a68ea03823cccad4a0525dcbc9b01e8c71"
|
||||
dependencies = [
|
||||
"ansi_term 0.12.1",
|
||||
"chrono",
|
||||
"lazy_static",
|
||||
"matchers",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f559b464de2e2bdabcac6a210d12e9b5a5973c251e102c44c585c71d51bd78e"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"static_assertions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.13.0"
|
||||
@@ -2377,29 +2347,29 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"clap",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"daemonize",
|
||||
"fs2",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||
"postgres-protocol",
|
||||
"postgres_ffi",
|
||||
"regex",
|
||||
"routerify",
|
||||
"rust-s3",
|
||||
"serde",
|
||||
"slog",
|
||||
"slog-async",
|
||||
"slog-scope",
|
||||
"slog-stdlog",
|
||||
"slog-term",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
]
|
||||
|
||||
@@ -2599,6 +2569,14 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"
|
||||
|
||||
[[package]]
|
||||
name = "yakv"
|
||||
version = "0.2.4"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"fs2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zenith"
|
||||
version = "0.1.0"
|
||||
@@ -2620,6 +2598,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"prometheus",
|
||||
]
|
||||
|
||||
@@ -2644,8 +2623,12 @@ dependencies = [
|
||||
"rustls-split",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-log",
|
||||
"tracing-subscriber",
|
||||
"webpki",
|
||||
"workspace_hack",
|
||||
"zenith_metrics",
|
||||
|
||||
@@ -15,3 +15,7 @@ members = [
|
||||
# This is useful for profiling and, to some extent, debug.
|
||||
# Besides, debug info should not affect the performance.
|
||||
debug = true
|
||||
panic = 'abort'
|
||||
|
||||
[profile.dev]
|
||||
panic = 'abort'
|
||||
|
||||
36
Dockerfile
36
Dockerfile
@@ -10,39 +10,21 @@ FROM zenithdb/build:buster AS pg-build
|
||||
WORKDIR /zenith
|
||||
COPY ./vendor/postgres vendor/postgres
|
||||
COPY ./Makefile Makefile
|
||||
ENV BUILD_TYPE release
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
|
||||
|
||||
#
|
||||
# Calculate cargo dependencies.
|
||||
# This will always run, but only generate recipe.json with list of dependencies without
|
||||
# installing them.
|
||||
#
|
||||
FROM zenithdb/build:buster AS cargo-deps-inspect
|
||||
WORKDIR /zenith
|
||||
COPY . .
|
||||
RUN cargo chef prepare --recipe-path /zenith/recipe.json
|
||||
|
||||
#
|
||||
# Build cargo dependencies.
|
||||
# This temp cantainner should be rebuilt only if recipe.json was changed.
|
||||
#
|
||||
FROM zenithdb/build:buster AS deps-build
|
||||
WORKDIR /zenith
|
||||
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
COPY --from=cargo-deps-inspect /usr/local/cargo/bin/cargo-chef /usr/local/cargo/bin/
|
||||
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
|
||||
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
|
||||
RUN rm -rf postgres_install/build
|
||||
|
||||
#
|
||||
# Build zenith binaries
|
||||
#
|
||||
# TODO: build cargo deps as separate layer. We used cargo-chef before but that was
|
||||
# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
|
||||
#
|
||||
FROM zenithdb/build:buster AS build
|
||||
WORKDIR /zenith
|
||||
COPY . .
|
||||
# Copy cached dependencies
|
||||
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
COPY --from=deps-build /zenith/target target
|
||||
COPY --from=deps-build /usr/local/cargo/ /usr/local/cargo/
|
||||
|
||||
COPY . .
|
||||
RUN cargo build --release
|
||||
|
||||
#
|
||||
@@ -51,11 +33,11 @@ RUN cargo build --release
|
||||
FROM debian:buster-slim
|
||||
WORKDIR /data
|
||||
|
||||
RUN apt-get update && apt-get -yq install librocksdb-dev libseccomp-dev openssl && \
|
||||
RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \
|
||||
mkdir zenith_install
|
||||
|
||||
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/proxy /usr/local/bin
|
||||
COPY --from=pg-build /zenith/tmp_install postgres_install
|
||||
COPY docker-entrypoint.sh /docker-entrypoint.sh
|
||||
|
||||
@@ -81,7 +81,7 @@ FROM alpine:3.13
|
||||
RUN apk add --update openssl build-base libseccomp-dev
|
||||
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
|
||||
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/proxy /usr/local/bin
|
||||
COPY --from=pg-build /zenith/tmp_install /usr/local
|
||||
COPY docker-entrypoint.sh /docker-entrypoint.sh
|
||||
|
||||
@@ -9,7 +9,7 @@ WORKDIR /zenith
|
||||
# Install postgres and zenith build dependencies
|
||||
# clang is for rocksdb
|
||||
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
|
||||
libseccomp-dev pkg-config libssl-dev librocksdb-dev clang
|
||||
libseccomp-dev pkg-config libssl-dev clang
|
||||
|
||||
# Install rust tools
|
||||
RUN rustup component add clippy && cargo install cargo-chef cargo-audit
|
||||
RUN rustup component add clippy && cargo install cargo-audit
|
||||
|
||||
58
Makefile
58
Makefile
@@ -6,34 +6,55 @@ else
|
||||
SECCOMP =
|
||||
endif
|
||||
|
||||
#
|
||||
# We differentiate between release / debug build types using the BUILD_TYPE
|
||||
# environment variable.
|
||||
#
|
||||
BUILD_TYPE ?= debug
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug
|
||||
PG_CFLAGS = -O2 -g3 $(CFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 $(CFLAGS)
|
||||
else
|
||||
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
# Choose whether we should be silent or verbose
|
||||
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
|
||||
# Fix for a corner case when make doesn't pass a jobserver
|
||||
CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
|
||||
|
||||
# This option has a side effect of passing make jobserver to cargo.
|
||||
# However, we shouldn't do this if `make -n` (--dry-run) has been asked.
|
||||
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
||||
# Force cargo not to print progress bar
|
||||
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
|
||||
#
|
||||
# Top level Makefile to build Zenith and PostgreSQL
|
||||
#
|
||||
.PHONY: all
|
||||
all: zenith postgres
|
||||
|
||||
# We don't want to run 'cargo build' in parallel with the postgres build,
|
||||
# because interleaving cargo build output with postgres build output looks
|
||||
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
|
||||
# much parallelism. (Recursive invocation of postgres target still gets any
|
||||
# '-j' flag from the command line, so 'make -j' is still useful.)
|
||||
.NOTPARALLEL:
|
||||
|
||||
### Zenith Rust bits
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: zenith
|
||||
zenith: postgres-headers
|
||||
cargo build
|
||||
+@echo "Compiling Zenith"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||
|
||||
### PostgreSQL parts
|
||||
tmp_install/build/config.status:
|
||||
+@echo "Configuring postgres build"
|
||||
mkdir -p tmp_install/build
|
||||
(cd tmp_install/build && \
|
||||
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
|
||||
--enable-cassert \
|
||||
--enable-debug \
|
||||
--enable-depend \
|
||||
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
|
||||
$(PG_CONFIGURE_OPTS) \
|
||||
$(SECCOMP) \
|
||||
--prefix=$(abspath tmp_install) > configure.log)
|
||||
|
||||
@@ -47,10 +68,10 @@ postgres-headers: postgres-configure
|
||||
+@echo "Installing PostgreSQL headers"
|
||||
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
|
||||
|
||||
|
||||
# Compile and install PostgreSQL and contrib/zenith
|
||||
.PHONY: postgres
|
||||
postgres: postgres-configure
|
||||
postgres: postgres-configure \
|
||||
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
|
||||
+@echo "Compiling PostgreSQL"
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
|
||||
+@echo "Compiling contrib/zenith"
|
||||
@@ -58,18 +79,21 @@ postgres: postgres-configure
|
||||
+@echo "Compiling contrib/zenith_test_utils"
|
||||
$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
|
||||
|
||||
.PHONY: postgres-clean
|
||||
postgres-clean:
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean:
|
||||
cd tmp_install/build && ${MAKE} clean
|
||||
cargo clean
|
||||
cd tmp_install/build && $(MAKE) clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
# This removes everything
|
||||
.PHONY: distclean
|
||||
distclean:
|
||||
rm -rf tmp_install
|
||||
cargo clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
.PHONY: fmt
|
||||
fmt:
|
||||
|
||||
25
README.md
25
README.md
@@ -6,7 +6,7 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus
|
||||
|
||||
A Zenith installation consists of Compute nodes and Storage engine.
|
||||
|
||||
Compute nodes are stateles PostgreSQL nodes, backed by zenith storage.
|
||||
Compute nodes are stateless PostgreSQL nodes, backed by zenith storage.
|
||||
|
||||
Zenith storage engine consists of two major components:
|
||||
- Pageserver. Scalable storage backend for compute nodes.
|
||||
@@ -25,10 +25,10 @@ Pageserver consists of:
|
||||
On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||
```text
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang
|
||||
libssl-dev clang pkg-config libpq-dev
|
||||
```
|
||||
|
||||
[Rust] 1.52 or later is also required.
|
||||
[Rust] 1.55 or later is also required.
|
||||
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
|
||||
@@ -108,6 +108,13 @@ postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
```
|
||||
|
||||
6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
|
||||
```sh
|
||||
> ./target/debug/zenith pg stop migration_check
|
||||
> ./target/debug/zenith pg stop main
|
||||
> ./target/debug/zenith stop
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
```sh
|
||||
@@ -125,9 +132,19 @@ Now we use README files to cover design ideas and overall architecture for each
|
||||
|
||||
To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
|
||||
|
||||
### Postgres-specific terms
|
||||
|
||||
Due to Zenith's very close relation with PostgreSQL internals, there are numerous specific terms used.
|
||||
Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
|
||||
|
||||
To get more familiar with this aspect, refer to:
|
||||
|
||||
- [Zenith glossary](/docs/glossary.md)
|
||||
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
|
||||
- Other PostgreSQL documentation and sources (Zenith fork sources can be found [here](https://github.com/zenithdb/postgres))
|
||||
|
||||
## Join the development
|
||||
|
||||
- Read `CONTRIBUTING.md` to learn about project code style and practices.
|
||||
- Use glossary in [/docs/glossary.md](/docs/glossary.md)
|
||||
- To get familiar with a source tree layout, use [/docs/sourcetree.md](/docs/sourcetree.md).
|
||||
- To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html
|
||||
|
||||
@@ -16,6 +16,7 @@ toml = "0.5"
|
||||
lazy_static = "1.4"
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
thiserror = "1"
|
||||
bytes = "1.0.1"
|
||||
nix = "0.20"
|
||||
url = "2.2.2"
|
||||
|
||||
@@ -1,18 +1,16 @@
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs::{self, File};
|
||||
use std::io::Write;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::process::Command;
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use postgres_ffi::pg_constants;
|
||||
use regex::Regex;
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
@@ -20,6 +18,7 @@ use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage::PageServerNode;
|
||||
|
||||
//
|
||||
@@ -85,23 +84,53 @@ impl ComputeControlPlane {
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: see also parse_point_in_time in branches.rs.
|
||||
fn parse_point_in_time(
|
||||
&self,
|
||||
tenantid: ZTenantId,
|
||||
s: &str,
|
||||
) -> Result<(ZTimelineId, Option<Lsn>)> {
|
||||
let mut strings = s.split('@');
|
||||
let name = strings.next().unwrap();
|
||||
|
||||
let lsn: Option<Lsn>;
|
||||
if let Some(lsnstr) = strings.next() {
|
||||
lsn = Some(
|
||||
Lsn::from_str(lsnstr)
|
||||
.with_context(|| "invalid LSN in point-in-time specification")?,
|
||||
);
|
||||
} else {
|
||||
lsn = None
|
||||
}
|
||||
|
||||
// Resolve the timeline ID, given the human-readable branch name
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
.branch_get_by_name(&tenantid, name)?
|
||||
.timeline_id;
|
||||
|
||||
Ok((timeline_id, lsn))
|
||||
}
|
||||
|
||||
pub fn new_node(
|
||||
&mut self,
|
||||
tenantid: ZTenantId,
|
||||
branch_name: &str,
|
||||
name: &str,
|
||||
timeline_spec: &str,
|
||||
port: Option<u16>,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
.branch_get_by_name(&tenantid, branch_name)?
|
||||
.timeline_id;
|
||||
// Resolve the human-readable timeline spec into timeline ID and LSN
|
||||
let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?;
|
||||
|
||||
let port = port.unwrap_or_else(|| self.get_port());
|
||||
let node = Arc::new(PostgresNode {
|
||||
name: branch_name.to_owned(),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
|
||||
name: name.to_owned(),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
is_test: false,
|
||||
timelineid: timeline_id,
|
||||
timelineid,
|
||||
lsn,
|
||||
tenantid,
|
||||
uses_wal_proposer: false,
|
||||
});
|
||||
@@ -126,6 +155,7 @@ pub struct PostgresNode {
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
|
||||
pub tenantid: ZTenantId,
|
||||
uses_wal_proposer: bool,
|
||||
}
|
||||
@@ -143,76 +173,28 @@ impl PostgresNode {
|
||||
);
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
|
||||
static ref CONF_TIMELINE_RE: Regex =
|
||||
Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
|
||||
static ref CONF_TENANT_RE: Regex =
|
||||
Regex::new(r"(?m)^\s*zenith.zenith_tenant\s*=\s*'(\w+)'\s*$").unwrap();
|
||||
}
|
||||
|
||||
// parse data directory name
|
||||
let fname = entry.file_name();
|
||||
let name = fname.to_str().unwrap().to_string();
|
||||
|
||||
// find out tcp port in config file
|
||||
// Read config file into memory
|
||||
let cfg_path = entry.path().join("postgresql.conf");
|
||||
let config = fs::read_to_string(cfg_path.clone()).with_context(|| {
|
||||
format!(
|
||||
"failed to read config file in {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
)
|
||||
})?;
|
||||
let cfg_path_str = cfg_path.to_string_lossy();
|
||||
let mut conf_file = File::open(&cfg_path)
|
||||
.with_context(|| format!("failed to open config file in {}", cfg_path_str))?;
|
||||
let conf = PostgresConf::read(&mut conf_file)
|
||||
.with_context(|| format!("failed to read config file in {}", cfg_path_str))?;
|
||||
|
||||
// parse port
|
||||
let err_msg = format!(
|
||||
"failed to find port definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let port: u16 = CONF_PORT_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
// Read a few options from the config file
|
||||
let context = format!("in config file {}", cfg_path_str);
|
||||
let port: u16 = conf.parse_field("port", &context)?;
|
||||
let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
|
||||
let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
|
||||
let uses_wal_proposer = conf.get("wal_acceptors").is_some();
|
||||
|
||||
// parse timeline
|
||||
let err_msg = format!(
|
||||
"failed to find timeline definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let timelineid: ZTimelineId = CONF_TIMELINE_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// parse tenant
|
||||
let err_msg = format!(
|
||||
"failed to find tenant definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let tenantid = CONF_TENANT_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
let uses_wal_proposer = config.contains("wal_acceptors");
|
||||
// parse recovery_target_lsn, if any
|
||||
let recovery_target_lsn: Option<Lsn> =
|
||||
conf.parse_field_optional("recovery_target_lsn", &context)?;
|
||||
|
||||
// ok now
|
||||
Ok(PostgresNode {
|
||||
@@ -222,6 +204,7 @@ impl PostgresNode {
|
||||
pageserver: Arc::clone(pageserver),
|
||||
is_test: false,
|
||||
timelineid,
|
||||
lsn: recovery_target_lsn,
|
||||
tenantid,
|
||||
uses_wal_proposer,
|
||||
})
|
||||
@@ -229,18 +212,24 @@ impl PostgresNode {
|
||||
|
||||
fn sync_walkeepers(&self) -> Result<Lsn> {
|
||||
let pg_path = self.env.pg_bin_dir().join("postgres");
|
||||
let sync_output = Command::new(pg_path)
|
||||
let sync_handle = Command::new(pg_path)
|
||||
.arg("--sync-safekeepers")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGDATA", self.pgdata().to_str().unwrap())
|
||||
.output()
|
||||
.with_context(|| "sync-walkeepers failed")?;
|
||||
.stdout(Stdio::piped())
|
||||
// Comment this to avoid capturing stderr (useful if command hangs)
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"sync-walkeepers failed: '{}'",
|
||||
"sync-safekeepers failed: '{}'",
|
||||
String::from_utf8_lossy(&sync_output.stderr)
|
||||
);
|
||||
}
|
||||
@@ -277,7 +266,7 @@ impl PostgresNode {
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
tar::Archive::new(copyreader)
|
||||
.unpack(&self.pgdata())
|
||||
.with_context(|| "extracting page backup failed")?;
|
||||
.with_context(|| "extracting base backup failed")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -301,84 +290,76 @@ impl PostgresNode {
|
||||
// Connect to a page server, get base backup, and untar it to initialize a
|
||||
// new data directory
|
||||
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
|
||||
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
|
||||
|
||||
let mut conf = PostgresConf::new();
|
||||
conf.append("max_wal_senders", "10");
|
||||
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
|
||||
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"max_wal_senders = 10\n\
|
||||
wal_log_hints = on\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
fsync = off\n\
|
||||
max_connections = 100\n\
|
||||
wal_sender_timeout = 0\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n",
|
||||
address = self.address.ip(),
|
||||
port = self.address.port()
|
||||
),
|
||||
)?;
|
||||
conf.append("wal_log_hints", "on");
|
||||
conf.append("max_replication_slots", "10");
|
||||
conf.append("hot_standby", "on");
|
||||
conf.append("shared_buffers", "1MB");
|
||||
conf.append("max_wal_size", "100GB");
|
||||
conf.append("fsync", "off");
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_sender_timeout", "0");
|
||||
conf.append("wal_level", "replica");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeeper or
|
||||
// page server yet. (gh issue #349)
|
||||
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;
|
||||
conf.append("wal_keep_size", "10TB");
|
||||
|
||||
// set up authentication
|
||||
let password = if let AuthType::ZenithJWT = auth_type {
|
||||
"$ZENITH_AUTH_TOKEN"
|
||||
} else {
|
||||
""
|
||||
// Configure the node to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
|
||||
|
||||
// Set up authentication
|
||||
//
|
||||
// $ZENITH_AUTH_TOKEN will be replaced with value from environment
|
||||
// variable during compute pg startup. It is done this way because
|
||||
// otherwise user will be able to retrieve the value using SHOW
|
||||
// command or pg_settings
|
||||
let password = if let AuthType::ZenithJWT = auth_type {
|
||||
"$ZENITH_AUTH_TOKEN"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
format!("host={} port={} password={}", host, port, password)
|
||||
};
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
concat!(
|
||||
"shared_preload_libraries = zenith\n",
|
||||
// $ZENITH_AUTH_TOKEN will be replaced with value from environment variable during compute pg startup
|
||||
// it is done this way because otherwise user will be able to retrieve the value using SHOW command or pg_settings
|
||||
"zenith.page_server_connstring = 'host={} port={} password={}'\n",
|
||||
"zenith.zenith_timeline='{}'\n",
|
||||
"zenith.zenith_tenant='{}'\n",
|
||||
),
|
||||
host, port, password, self.timelineid, self.tenantid,
|
||||
)
|
||||
.as_str(),
|
||||
)?;
|
||||
conf.append("shared_preload_libraries", "zenith");
|
||||
conf.append_line("");
|
||||
conf.append("zenith.page_server_connstring", &pageserver_connstr);
|
||||
conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
|
||||
conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
|
||||
if let Some(lsn) = self.lsn {
|
||||
conf.append("recovery_target_lsn", &lsn.to_string());
|
||||
}
|
||||
conf.append_line("");
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
concat!(
|
||||
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
|
||||
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
),
|
||||
self.connstr(),
|
||||
)
|
||||
.as_str(),
|
||||
)?;
|
||||
conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
|
||||
conf.append("zenith.callmemaybe_connstring", &self.connstr());
|
||||
|
||||
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
|
||||
file.write_all(conf.to_string().as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_basebackup(&self) -> Result<()> {
|
||||
let lsn = if self.uses_wal_proposer {
|
||||
// LSN WAL_SEGMENT_SIZE means that it is bootstrap and we need to download just
|
||||
let backup_lsn = if let Some(lsn) = self.lsn {
|
||||
Some(lsn)
|
||||
} else if self.uses_wal_proposer {
|
||||
// LSN 0 means that it is bootstrap and we need to download just
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things would be more stable (TODO).
|
||||
let lsn = self.sync_walkeepers()?;
|
||||
if lsn == Lsn(pg_constants::WAL_SEGMENT_SIZE as u64) {
|
||||
if lsn == Lsn(0) {
|
||||
None
|
||||
} else {
|
||||
Some(lsn)
|
||||
@@ -387,7 +368,7 @@ impl PostgresNode {
|
||||
None
|
||||
};
|
||||
|
||||
self.do_basebackup(lsn)?;
|
||||
self.do_basebackup(backup_lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -409,14 +390,6 @@ impl PostgresNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata().join(config).to_str().unwrap())?
|
||||
.write_all(opts.as_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
|
||||
let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl");
|
||||
let mut cmd = Command::new(pg_ctl_path);
|
||||
@@ -472,6 +445,10 @@ impl PostgresNode {
|
||||
// 3. Load basebackup
|
||||
self.load_basebackup()?;
|
||||
|
||||
if self.lsn.is_some() {
|
||||
File::create(self.pgdata().join("standby.signal"))?;
|
||||
}
|
||||
|
||||
// 4. Finally start the compute node postgres
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"], auth_token)
|
||||
@@ -482,13 +459,22 @@ impl PostgresNode {
|
||||
}
|
||||
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
|
||||
// If we are going to destroy data directory,
|
||||
// use immediate shutdown mode, otherwise,
|
||||
// shutdown gracefully to leave the data directory sane.
|
||||
//
|
||||
// Compute node always starts from scratch, so stop
|
||||
// without destroy only used for testing and debugging.
|
||||
//
|
||||
if destroy {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
self.pgdata().to_str().unwrap()
|
||||
);
|
||||
fs::remove_dir_all(&self.pgdata())?;
|
||||
} else {
|
||||
self.pg_ctl(&["stop"], &None)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -509,9 +495,7 @@ impl PostgresNode {
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
assert!(output.status.success(), "whoami failed");
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ use std::path::Path;
|
||||
|
||||
pub mod compute;
|
||||
pub mod local_env;
|
||||
pub mod postgresql_conf;
|
||||
pub mod storage;
|
||||
|
||||
/// Read a PID file
|
||||
|
||||
@@ -4,27 +4,24 @@
|
||||
// Now it also provides init method which acts like a stub for proper installation
|
||||
// script which will use local paths.
|
||||
//
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use hex;
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::{collections::BTreeMap, env};
|
||||
use url::Url;
|
||||
use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
pub type Remotes = BTreeMap<String, String>;
|
||||
|
||||
//
|
||||
// This data structures represent deserialized zenith CLI config
|
||||
//
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct LocalEnv {
|
||||
// Pageserver connection strings
|
||||
pub pageserver_connstring: String,
|
||||
// Pageserver connection settings
|
||||
pub pageserver_pg_port: u16,
|
||||
pub pageserver_http_port: u16,
|
||||
|
||||
// Base directory for both pageserver and compute nodes
|
||||
pub base_data_dir: PathBuf,
|
||||
@@ -35,8 +32,8 @@ pub struct LocalEnv {
|
||||
// to four separate paths and match OS-specific installation layout.
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Path to pageserver binary. Empty for remote pageserver.
|
||||
pub zenith_distrib_dir: Option<PathBuf>,
|
||||
// Path to pageserver binary.
|
||||
pub zenith_distrib_dir: PathBuf,
|
||||
|
||||
// keeping tenant id in config to reduce copy paste when running zenith locally with single tenant
|
||||
#[serde(with = "hex")]
|
||||
@@ -50,8 +47,6 @@ pub struct LocalEnv {
|
||||
|
||||
// used to issue tokens during e.g pg start
|
||||
pub private_key_path: PathBuf,
|
||||
|
||||
pub remotes: Remotes,
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
@@ -64,11 +59,7 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> Result<PathBuf> {
|
||||
Ok(self
|
||||
.zenith_distrib_dir
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("Can not manage remote pageserver"))?
|
||||
.join("pageserver"))
|
||||
Ok(self.zenith_distrib_dir.join("pageserver"))
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
@@ -98,7 +89,8 @@ fn base_path() -> PathBuf {
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init(
|
||||
remote_pageserver: Option<&str>,
|
||||
pageserver_pg_port: u16,
|
||||
pageserver_http_port: u16,
|
||||
tenantid: ZTenantId,
|
||||
auth_type: AuthType,
|
||||
) -> Result<()> {
|
||||
@@ -165,39 +157,22 @@ pub fn init(
|
||||
let auth_token =
|
||||
encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
|
||||
|
||||
let conf = if let Some(addr) = remote_pageserver {
|
||||
// check that addr is parsable
|
||||
let _uri = Url::parse(addr).map_err(|e| anyhow!("{}: {}", addr, e))?;
|
||||
// Find zenith binaries.
|
||||
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
if !zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.",);
|
||||
}
|
||||
|
||||
LocalEnv {
|
||||
pageserver_connstring: format!("postgresql://{}/", addr),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir: None,
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
tenantid,
|
||||
auth_token,
|
||||
auth_type,
|
||||
private_key_path,
|
||||
}
|
||||
} else {
|
||||
// Find zenith binaries.
|
||||
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
if !zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.",);
|
||||
}
|
||||
|
||||
LocalEnv {
|
||||
pageserver_connstring: "postgresql://127.0.0.1:6400".to_string(),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir: Some(zenith_distrib_dir),
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
tenantid,
|
||||
auth_token,
|
||||
auth_type,
|
||||
private_key_path,
|
||||
}
|
||||
let conf = LocalEnv {
|
||||
pageserver_pg_port,
|
||||
pageserver_http_port,
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir,
|
||||
base_data_dir: base_path,
|
||||
tenantid,
|
||||
auth_token,
|
||||
auth_type,
|
||||
private_key_path,
|
||||
};
|
||||
|
||||
fs::create_dir_all(conf.pg_data_dirs_path())?;
|
||||
@@ -225,12 +200,3 @@ pub fn load_config() -> Result<LocalEnv> {
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
toml::from_str(config.as_str()).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
// Save config. We use that to change set of remotes from CLI itself.
|
||||
pub fn save_config(conf: &LocalEnv) -> Result<()> {
|
||||
let config_path = base_path().join("config");
|
||||
let conf_str = toml::to_string_pretty(conf)?;
|
||||
|
||||
fs::write(config_path, conf_str)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
228
control_plane/src/postgresql_conf.rs
Normal file
228
control_plane/src/postgresql_conf.rs
Normal file
@@ -0,0 +1,228 @@
|
||||
///
|
||||
/// Module for parsing postgresql.conf file.
|
||||
///
|
||||
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
|
||||
/// enough to extract a few settings we need in Zenith, assuming you don't do
|
||||
/// funny stuff like include-directives or funny escaping.
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::BufRead;
|
||||
use std::str::FromStr;
|
||||
|
||||
/// In-memory representation of a postgresql.conf file
|
||||
#[derive(Default)]
|
||||
pub struct PostgresConf {
|
||||
lines: Vec<String>,
|
||||
hash: HashMap<String, String>,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
|
||||
}
|
||||
|
||||
impl PostgresConf {
|
||||
pub fn new() -> PostgresConf {
|
||||
PostgresConf::default()
|
||||
}
|
||||
|
||||
/// Read file into memory
|
||||
pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
|
||||
let mut result = Self::new();
|
||||
|
||||
for line in std::io::BufReader::new(read).lines() {
|
||||
let line = line?;
|
||||
|
||||
// Store each line in a vector, in original format
|
||||
result.lines.push(line.clone());
|
||||
|
||||
// Also parse each line and insert key=value lines into a hash map.
|
||||
//
|
||||
// FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
|
||||
// But it's close enough for our usage.
|
||||
let line = line.trim();
|
||||
if line.starts_with('#') {
|
||||
// comment, ignore
|
||||
continue;
|
||||
} else if let Some(caps) = CONF_LINE_RE.captures(line) {
|
||||
let name = caps.get(1).unwrap().as_str();
|
||||
let raw_val = caps.get(2).unwrap().as_str();
|
||||
|
||||
if let Ok(val) = deescape_str(raw_val) {
|
||||
// Note: if there's already an entry in the hash map for
|
||||
// this key, this will replace it. That's the behavior what
|
||||
// we want; when PostgreSQL reads the file, each line
|
||||
// overrides any previous value for the same setting.
|
||||
result.hash.insert(name.to_string(), val.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Return the current value of 'option'
|
||||
pub fn get(&self, option: &str) -> Option<&str> {
|
||||
self.hash.get(option).map(|x| x.as_ref())
|
||||
}
|
||||
|
||||
/// Return the current value of a field, parsed to the right datatype.
|
||||
///
|
||||
/// This calls the FromStr::parse() function on the value of the field. If
|
||||
/// the field does not exist, or parsing fails, returns an error.
|
||||
///
|
||||
pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
self.get(field_name)
|
||||
.ok_or_else(|| anyhow!("could not find '{}' option {}", field_name, context))?
|
||||
.parse::<T>()
|
||||
.with_context(|| format!("could not parse '{}' option {}", field_name, context))
|
||||
}
|
||||
|
||||
pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
if let Some(val) = self.get(field_name) {
|
||||
let result = val
|
||||
.parse::<T>()
|
||||
.with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
|
||||
|
||||
Ok(Some(result))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Note: if you call this multiple times for the same option, the config
|
||||
/// file will a line for each call. It would be nice to have a function
|
||||
/// to change an existing line, but that's a TODO.
|
||||
///
|
||||
pub fn append(&mut self, option: &str, value: &str) {
|
||||
self.lines
|
||||
.push(format!("{}={}\n", option, escape_str(value)));
|
||||
self.hash.insert(option.to_string(), value.to_string());
|
||||
}
|
||||
|
||||
/// Append an arbitrary non-setting line to the config file
|
||||
pub fn append_line(&mut self, line: &str) {
|
||||
self.lines.push(line.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for PostgresConf {
|
||||
/// Return the whole configuration file as a string
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
for line in self.lines.iter() {
|
||||
f.write_str(line)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Escape a value for putting in postgresql.conf.
|
||||
fn escape_str(s: &str) -> String {
|
||||
// If the string doesn't contain anything that needs quoting or escaping, return it
|
||||
// as it is.
|
||||
//
|
||||
// The first part of the regex, before the '|', matches the INTEGER rule in the
|
||||
// PostgreSQL flex grammar (guc-file.l). It matches plain integers like "123" and
|
||||
// "-123", and also accepts units like "10MB". The second part of the regex matches
|
||||
// the UNQUOTED_STRING rule, and accepts strings that contain a single word, beginning
|
||||
// with a letter. That covers words like "off" or "posix". Everything else is quoted.
|
||||
//
|
||||
// This regex is a bit more conservative than the rules in guc-file.l, so we quote some
|
||||
// strings that PostgreSQL would accept without quoting, but that's OK.
|
||||
lazy_static! {
|
||||
static ref UNQUOTED_RE: Regex =
|
||||
Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
|
||||
}
|
||||
if UNQUOTED_RE.is_match(s) {
|
||||
s.to_string()
|
||||
} else {
|
||||
// Otherwise escape and quote it
|
||||
let s = s
|
||||
.replace('\\', "\\\\")
|
||||
.replace('\n', "\\n")
|
||||
.replace('\'', "''");
|
||||
|
||||
"\'".to_owned() + &s + "\'"
|
||||
}
|
||||
}
|
||||
|
||||
/// De-escape a possibly-quoted value.
|
||||
///
|
||||
/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
|
||||
/// does this.
|
||||
fn deescape_str(s: &str) -> Result<String> {
|
||||
// If the string has a quote at the beginning and end, strip them out.
|
||||
if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
|
||||
let mut result = String::new();
|
||||
|
||||
let mut iter = s[1..(s.len() - 1)].chars().peekable();
|
||||
while let Some(c) = iter.next() {
|
||||
let newc = if c == '\\' {
|
||||
match iter.next() {
|
||||
Some('b') => '\x08',
|
||||
Some('f') => '\x0c',
|
||||
Some('n') => '\n',
|
||||
Some('r') => '\r',
|
||||
Some('t') => '\t',
|
||||
Some('0'..='7') => {
|
||||
// TODO
|
||||
bail!("octal escapes not supported");
|
||||
}
|
||||
Some(n) => n,
|
||||
None => break,
|
||||
}
|
||||
} else if c == '\'' && iter.peek() == Some(&'\'') {
|
||||
// doubled quote becomes just one quote
|
||||
iter.next().unwrap()
|
||||
} else {
|
||||
c
|
||||
};
|
||||
|
||||
result.push(newc);
|
||||
}
|
||||
Ok(result)
|
||||
} else {
|
||||
Ok(s.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_postgresql_conf_escapes() -> Result<()> {
|
||||
assert_eq!(escape_str("foo bar"), "'foo bar'");
|
||||
// these don't need to be quoted
|
||||
assert_eq!(escape_str("foo"), "foo");
|
||||
assert_eq!(escape_str("123"), "123");
|
||||
assert_eq!(escape_str("+123"), "+123");
|
||||
assert_eq!(escape_str("-10"), "-10");
|
||||
assert_eq!(escape_str("1foo"), "1foo");
|
||||
assert_eq!(escape_str("foo1"), "foo1");
|
||||
assert_eq!(escape_str("10MB"), "10MB");
|
||||
assert_eq!(escape_str("-10kB"), "-10kB");
|
||||
|
||||
// these need quoting and/or escaping
|
||||
assert_eq!(escape_str("foo bar"), "'foo bar'");
|
||||
assert_eq!(escape_str("fo'o"), "'fo''o'");
|
||||
assert_eq!(escape_str("fo\no"), "'fo\\no'");
|
||||
assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
|
||||
assert_eq!(escape_str("10 cats"), "'10 cats'");
|
||||
|
||||
// Test de-escaping
|
||||
assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
|
||||
assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
|
||||
assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
|
||||
|
||||
// octal-escapes are currently not supported
|
||||
assert!(deescape_str("'foo\\7\\07\\007'").is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,17 +1,19 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use anyhow::{anyhow, bail};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
|
||||
use postgres::{Config, NoTls};
|
||||
use reqwest::blocking::{Client, RequestBuilder};
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
@@ -20,7 +22,38 @@ use crate::read_pidfile;
|
||||
use pageserver::branches::BranchInfo;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
const HTTP_BASE_URL: &str = "http://127.0.0.1:9898/v1";
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
#[error("Reqwest error: {0}")]
|
||||
Transport(#[from] reqwest::Error),
|
||||
|
||||
#[error("Error: {0}")]
|
||||
Response(String),
|
||||
}
|
||||
|
||||
type Result<T> = result::Result<T, PageserverHttpError>;
|
||||
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
impl ResponseErrorMessageExt for Response {
|
||||
fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
// reqwest do not export it's error construction utility functions, so lets craft the message ourselves
|
||||
let url = self.url().to_owned();
|
||||
Err(PageserverHttpError::Response(
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Control routines for pageserver.
|
||||
@@ -46,27 +79,36 @@ impl PageServerNode {
|
||||
|
||||
PageServerNode {
|
||||
kill_on_exit: false,
|
||||
pg_connection_config: Self::default_config(password), // default
|
||||
pg_connection_config: Self::pageserver_connection_config(
|
||||
password,
|
||||
env.pageserver_pg_port,
|
||||
),
|
||||
env: env.clone(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: HTTP_BASE_URL.to_owned(),
|
||||
http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
|
||||
}
|
||||
}
|
||||
|
||||
fn default_config(password: &str) -> Config {
|
||||
format!("postgresql://no_user:{}@localhost:64000/no_db", password)
|
||||
fn pageserver_connection_config(password: &str, port: u16) -> Config {
|
||||
format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> Result<()> {
|
||||
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
|
||||
let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
|
||||
let mut args = vec![
|
||||
"--init",
|
||||
"-D",
|
||||
self.env.base_data_dir.to_str().unwrap(),
|
||||
"--postgres-distrib",
|
||||
self.env.pg_distrib_dir.to_str().unwrap(),
|
||||
"--listen-pg",
|
||||
&listen_pg,
|
||||
"--listen-http",
|
||||
&listen_http,
|
||||
];
|
||||
|
||||
if enable_auth {
|
||||
@@ -100,12 +142,13 @@ impl PageServerNode {
|
||||
self.repo_path().join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
println!(
|
||||
"Starting pageserver at '{}' in {}",
|
||||
pub fn start(&self) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting pageserver at '{}' in '{}'",
|
||||
connection_address(&self.pg_connection_config),
|
||||
self.repo_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
|
||||
@@ -122,41 +165,79 @@ impl PageServerNode {
|
||||
|
||||
// It takes a while for the page server to start up. Wait until it is
|
||||
// open for business.
|
||||
for retries in 1..15 {
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("Pageserver started");
|
||||
println!("\nPageserver started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
println!(
|
||||
"Pageserver not responding yet, err {} retrying ({})...",
|
||||
err, retries
|
||||
);
|
||||
match err {
|
||||
PageserverHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!(
|
||||
"Pageserver not responding yet, err {} retrying ({})...",
|
||||
err, retries
|
||||
);
|
||||
}
|
||||
}
|
||||
PageserverHttpError::Response(msg) => {
|
||||
bail!("pageserver failed to start: {} ", msg)
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("pageserver failed to start");
|
||||
bail!("pageserver failed to start in {} seconds", RETRIES);
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid = read_pidfile(&self.pid_file())?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
if immediate {
|
||||
println!("Stop pageserver immediately");
|
||||
if kill(pid, Signal::SIGQUIT).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
} else {
|
||||
println!("Stop pageserver gracefully");
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
}
|
||||
}
|
||||
|
||||
// wait for pageserver stop
|
||||
let address = connection_address(&self.pg_connection_config);
|
||||
for _ in 0..5 {
|
||||
let stream = TcpStream::connect(&address);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
if let Err(_e) = stream {
|
||||
println!("Pageserver stopped");
|
||||
return Ok(());
|
||||
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if pageserver checkpoints a lot of data
|
||||
for _ in 0..100 {
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Pageserver stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("Pageserver status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
println!("Pageserver status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Pageserver still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
println!("Stopping pageserver on {}", address);
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
@@ -169,7 +250,7 @@ impl PageServerNode {
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
|
||||
self.pg_connection_config.connect(NoTls)
|
||||
}
|
||||
|
||||
@@ -182,14 +263,9 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
pub fn check_status(&self) -> Result<()> {
|
||||
let status = self
|
||||
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
|
||||
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
|
||||
.send()?
|
||||
.status();
|
||||
ensure!(
|
||||
status == StatusCode::OK,
|
||||
format!("got unexpected response status {}", status)
|
||||
);
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -197,7 +273,7 @@ impl PageServerNode {
|
||||
Ok(self
|
||||
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
|
||||
.send()?
|
||||
.error_for_status()?
|
||||
.error_from_body()?
|
||||
.json()?)
|
||||
}
|
||||
|
||||
@@ -208,7 +284,7 @@ impl PageServerNode {
|
||||
tenant_id: tenantid,
|
||||
})
|
||||
.send()?
|
||||
.error_for_status()?
|
||||
.error_from_body()?
|
||||
.json()?)
|
||||
}
|
||||
|
||||
@@ -219,7 +295,7 @@ impl PageServerNode {
|
||||
format!("{}/branch/{}", self.http_base_url, tenantid),
|
||||
)
|
||||
.send()?
|
||||
.error_for_status()?
|
||||
.error_from_body()?
|
||||
.json()?)
|
||||
}
|
||||
|
||||
@@ -230,42 +306,38 @@ impl PageServerNode {
|
||||
tenantid: &ZTenantId,
|
||||
) -> Result<BranchInfo> {
|
||||
Ok(self
|
||||
.http_request(Method::POST, format!("{}/{}", self.http_base_url, "branch"))
|
||||
.http_request(Method::POST, format!("{}/branch", self.http_base_url))
|
||||
.json(&BranchCreateRequest {
|
||||
tenant_id: tenantid.to_owned(),
|
||||
name: branch_name.to_owned(),
|
||||
start_point: startpoint.to_owned(),
|
||||
})
|
||||
.send()?
|
||||
.error_for_status()?
|
||||
.error_from_body()?
|
||||
.json()?)
|
||||
}
|
||||
|
||||
// TODO: make this a separate request type and avoid loading all the branches
|
||||
pub fn branch_get_by_name(
|
||||
&self,
|
||||
tenantid: &ZTenantId,
|
||||
branch_name: &str,
|
||||
) -> Result<BranchInfo> {
|
||||
let branch_infos = self.branch_list(tenantid)?;
|
||||
let branch_by_name: Result<HashMap<String, BranchInfo>> = branch_infos
|
||||
.into_iter()
|
||||
.map(|branch_info| Ok((branch_info.name.clone(), branch_info)))
|
||||
.collect();
|
||||
let branch_by_name = branch_by_name?;
|
||||
|
||||
let branch = branch_by_name
|
||||
.get(branch_name)
|
||||
.ok_or_else(|| anyhow!("Branch {} not found", branch_name))?;
|
||||
|
||||
Ok(branch.clone())
|
||||
Ok(self
|
||||
.http_request(
|
||||
Method::GET,
|
||||
format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name),
|
||||
)
|
||||
.send()?
|
||||
.error_for_status()?
|
||||
.json()?)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
// TODO Looks like this flag is never set
|
||||
if self.kill_on_exit {
|
||||
let _ = self.stop();
|
||||
let _ = self.stop(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ if [ "$1" = 'pageserver' ]; then
|
||||
pageserver --init -D /data --postgres-distrib /usr/local
|
||||
fi
|
||||
echo "Staring pageserver at 0.0.0.0:6400"
|
||||
pageserver -l 0.0.0.0:6400 -D /data
|
||||
pageserver -l 0.0.0.0:6400 --listen-http 0.0.0.0:9898 -D /data
|
||||
else
|
||||
"$@"
|
||||
fi
|
||||
|
||||
@@ -10,5 +10,5 @@
|
||||
- [pageserver/README](/pageserver/README) — pageserver overview.
|
||||
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
|
||||
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
|
||||
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
|
||||
- [walkeeper/README](/walkeeper/README) — WAL service overview.
|
||||
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
Currently we build two main images:
|
||||
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
|
||||
|
||||
And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
|
||||
|
||||
@@ -30,12 +30,20 @@ writes out the changes from in-memory layers into new layer files[]. This proces
|
||||
is called "checkpointing". The page server only creates layer files for
|
||||
relations that have been modified since the last checkpoint.
|
||||
|
||||
Configuration parameter `checkpoint_distance` defines the distance
|
||||
from current LSN to perform checkpoint of in-memory layers.
|
||||
Default is `DEFAULT_CHECKPOINT_DISTANCE`.
|
||||
Set this parameter to `0` to force checkpoint of every layer.
|
||||
|
||||
Configuration parameter `checkpoint_period` defines the interval between checkpoint iterations.
|
||||
Default is `DEFAULT_CHECKPOINT_PERIOD`.
|
||||
### Compute node
|
||||
|
||||
Stateless Postgres node that stores data in pageserver.
|
||||
|
||||
### Garbage collection
|
||||
|
||||
The process of removing old on-disk layers that are not needed by any timeline anymore.
|
||||
### Fork
|
||||
|
||||
Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
|
||||
@@ -43,7 +51,7 @@ Each PostgreSQL fork is considered a separate relish.
|
||||
|
||||
### Layer
|
||||
|
||||
Each layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
|
||||
Each layer corresponds to the specific version of a relish Segment in a range of LSNs.
|
||||
There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
layers are used to ingest incoming WAL, and provide fast access
|
||||
to the recent page versions. On-disk layers are stored as files on disk, and
|
||||
@@ -122,6 +130,20 @@ One repository corresponds to one Tenant.
|
||||
|
||||
How much history do we need to keep around for PITR and read-only nodes?
|
||||
|
||||
### Segment (PostgreSQL)
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
A physical file that stores data for a given relation. File segments are
|
||||
limited in size by a compile-time setting (1 gigabyte by default), so if a
|
||||
relation exceeds that size, it is split into multiple segments.
|
||||
|
||||
### Segment (Layered Repository)
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
Segment is a RELISH_SEG_SIZE slice of relish (identified by a SegmentTag).
|
||||
|
||||
### SLRU
|
||||
|
||||
SLRUs include pg_clog, pg_multixact/members, and
|
||||
|
||||
@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id
|
||||
|
||||
### Safety
|
||||
|
||||
For now particular tenant can only appear on a particular pageserver. Set of WAL acceptors are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
|
||||
For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
|
||||
|
||||
@@ -4,8 +4,6 @@ version = "0.1.0"
|
||||
authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
bookfile = "^0.3"
|
||||
chrono = "0.4.19"
|
||||
@@ -16,14 +14,10 @@ byteorder = "1.4.3"
|
||||
futures = "0.3.13"
|
||||
hyper = "0.14"
|
||||
lazy_static = "1.4.0"
|
||||
slog-stdlog = "4.1.0"
|
||||
slog-scope = "4.4.0"
|
||||
slog-term = "2.8.0"
|
||||
slog = "2.7.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.5.0", features = ["full"] }
|
||||
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
@@ -38,8 +32,19 @@ serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
toml = "0.5"
|
||||
scopeguard = "1.1.0"
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
async-trait = "0.1"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
|
||||
#yakv = { path = "../../yakv" }
|
||||
yakv = "0.2.4"
|
||||
lz4_flex = "0.9.0"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
hex-literal = "0.3"
|
||||
|
||||
@@ -7,8 +7,9 @@ The Page Server has a few different duties:
|
||||
- Replay WAL that's applicable to the chunks that the Page Server maintains
|
||||
- Backup to S3
|
||||
|
||||
|
||||
|
||||
S3 is the main fault-tolerant storage of all data, as there are no Page Server
|
||||
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
|
||||
keeps track of WAL records which are not syncted to S3 yet.
|
||||
|
||||
The Page Server consists of multiple threads that operate on a shared
|
||||
repository of page versions:
|
||||
|
||||
@@ -10,8 +10,10 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::Result;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use log::*;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
@@ -22,7 +24,7 @@ use crate::relish::*;
|
||||
use crate::repository::Timeline;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use zenith_utils::lsn::{Lsn, RecordLsn};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
@@ -30,7 +32,7 @@ use zenith_utils::lsn::{Lsn, RecordLsn};
|
||||
pub struct Basebackup<'a> {
|
||||
ar: Builder<&'a mut dyn Write>,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
pub lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -46,53 +48,56 @@ impl<'a> Basebackup<'a> {
|
||||
write: &'a mut dyn Write,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
req_lsn: Option<Lsn>,
|
||||
) -> Basebackup<'a> {
|
||||
// current_prev may be zero if we are at the start of timeline branched from old lsn
|
||||
let RecordLsn {
|
||||
last: current_last,
|
||||
prev: current_prev,
|
||||
} = timeline.get_last_record_rlsn();
|
||||
|
||||
// Compute postgres doesn't have any previous WAL files, but the first record that this
|
||||
// postgres is going to write need to have LSN of previous record (xl_prev). So we are
|
||||
// writing prev_lsn to "zenith.signal" file so that postgres can read it during the start.
|
||||
// In some cases we don't know prev_lsn (branch or basebackup @old_lsn) so pass Lsn(0)
|
||||
// instead and embrace the wrong xl_prev in this situations.
|
||||
) -> Result<Basebackup<'a>> {
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
// record that it's going to write needs to include the LSN of the
|
||||
// previous record (xl_prev). We include prev_record_lsn in the
|
||||
// "zenith.signal" file, so that postgres can read it during startup.
|
||||
//
|
||||
// We don't keep full history of record boundaries in the page server,
|
||||
// however, only the predecessor of the latest record on each
|
||||
// timeline. So we can only provide prev_record_lsn when you take a
|
||||
// base backup at the end of the timeline, i.e. at last_record_lsn.
|
||||
// Even at the end of the timeline, we sometimes don't have a valid
|
||||
// prev_lsn value; that happens if the timeline was just branched from
|
||||
// an old LSN and it doesn't have any WAL of its own yet. We will set
|
||||
// prev_lsn to Lsn(0) if we cannot provide the correct value.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
if req_lsn > current_last {
|
||||
// FIXME: now wait_lsn() is inside of list_nonrels() so we don't have a way
|
||||
// to get it from there. It is better to wait just here.
|
||||
(Lsn(0), req_lsn)
|
||||
} else if req_lsn < current_last {
|
||||
// we don't know prev already. We don't currently use basebackup@old_lsn
|
||||
// but may use it for read only replicas in future
|
||||
(Lsn(0), req_lsn)
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
timeline.wait_lsn(req_lsn)?;
|
||||
|
||||
// If the requested point is the end of the timeline, we can
|
||||
// provide prev_lsn. (get_last_record_rlsn() might return it as
|
||||
// zero, though, if no WAL has been generated on this timeline
|
||||
// yet.)
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
if req_lsn == end_of_timeline.last {
|
||||
(end_of_timeline.prev, req_lsn)
|
||||
} else {
|
||||
// we are exactly at req_lsn and know prev
|
||||
(current_prev, req_lsn)
|
||||
(Lsn(0), req_lsn)
|
||||
}
|
||||
} else {
|
||||
// None in req_lsn means that we are branching from the latest LSN
|
||||
(current_prev, current_last)
|
||||
// Backup was requested at end of the timeline.
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
(end_of_timeline.prev, end_of_timeline.last)
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={}",
|
||||
backup_prev, backup_lsn
|
||||
backup_lsn, backup_prev
|
||||
);
|
||||
|
||||
Basebackup {
|
||||
Ok(Basebackup {
|
||||
ar: Builder::new(write),
|
||||
timeline,
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: backup_prev,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
|
||||
// Create pgdata subdirs structure
|
||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||
info!("send subdir {:?}", *dir);
|
||||
let header = new_tar_header_dir(*dir)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
}
|
||||
@@ -154,11 +159,9 @@ impl<'a> Basebackup<'a> {
|
||||
let mut slru_buf: Vec<u8> =
|
||||
Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img = self.timeline.get_page_at_lsn_nowait(
|
||||
RelishTag::Slru { slru, segno },
|
||||
blknum,
|
||||
self.lsn,
|
||||
)?;
|
||||
let img =
|
||||
self.timeline
|
||||
.get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?;
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
slru_buf.extend_from_slice(&img);
|
||||
@@ -177,7 +180,7 @@ impl<'a> Basebackup<'a> {
|
||||
// Along with them also send PG_VERSION for each database.
|
||||
//
|
||||
fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_page_at_lsn_nowait(
|
||||
let img = self.timeline.get_page_at_lsn(
|
||||
RelishTag::FileNodeMap { spcnode, dbnode },
|
||||
0,
|
||||
self.lsn,
|
||||
@@ -219,7 +222,7 @@ impl<'a> Basebackup<'a> {
|
||||
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(RelishTag::TwoPhase { xid }, 0, self.lsn)?;
|
||||
.get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
@@ -237,22 +240,16 @@ impl<'a> Basebackup<'a> {
|
||||
// Also send zenith.signal file with extra bootstrap data.
|
||||
//
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(RelishTag::Checkpoint, 0, self.lsn)?;
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)?;
|
||||
let pg_control_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(RelishTag::ControlFile, 0, self.lsn)?;
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
// Generate new pg_control and WAL needed for bootstrap
|
||||
let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let checkpoint_lsn = XLogSegNoOffsetToRecPtr(
|
||||
checkpoint_segno,
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
// Generate new pg_control needed for bootstrap
|
||||
checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
@@ -261,19 +258,24 @@ impl<'a> Basebackup<'a> {
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = checkpoint_lsn;
|
||||
pg_control.checkPoint = 0;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
pg_control.state = pg_constants::DB_SHUTDOWNED;
|
||||
|
||||
// add zenith.signal file
|
||||
let xl_prev = if self.prev_record_lsn == Lsn(0) {
|
||||
0xBAD0 // magic value to indicate that we don't know prev_lsn
|
||||
let mut zenith_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
if self.lsn == self.timeline.get_ancestor_lsn() {
|
||||
write!(zenith_signal, "PREV LSN: none")?;
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: invalid")?;
|
||||
}
|
||||
} else {
|
||||
self.prev_record_lsn.0
|
||||
};
|
||||
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
|
||||
}
|
||||
self.ar.append(
|
||||
&new_tar_header("zenith.signal", 8)?,
|
||||
&xl_prev.to_le_bytes()[..],
|
||||
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
||||
zenith_signal.as_bytes(),
|
||||
)?;
|
||||
|
||||
//send pg_control
|
||||
@@ -282,14 +284,15 @@ impl<'a> Basebackup<'a> {
|
||||
self.ar.append(&header, &pg_control_bytes[..])?;
|
||||
|
||||
//send wal segment
|
||||
let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
checkpoint_segno,
|
||||
segno,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(&pg_control);
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
|
||||
assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
//! Main entry point for the dump_layerfile executable
|
||||
//!
|
||||
//! A handy tool for debugging, that's all.
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
.about("Dump contents of one layer file, for debugging")
|
||||
.arg(
|
||||
Arg::with_name("path")
|
||||
.help("Path to file to dump")
|
||||
.required(true)
|
||||
.index(1),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
|
||||
|
||||
dump_layerfile_from_path(&path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -2,44 +2,81 @@
|
||||
// Main entry point for the Page Server executable
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
env,
|
||||
net::TcpListener,
|
||||
path::{Path, PathBuf},
|
||||
process::exit,
|
||||
str::FromStr,
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
use zenith_utils::{auth::JwtAuth, postgres_backend::AuthType};
|
||||
use tracing::*;
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use signal_hook::consts::signal::*;
|
||||
use signal_hook::consts::TERM_SIGNALS;
|
||||
use signal_hook::flag;
|
||||
use signal_hook::iterator::exfiltrator::WithOrigin;
|
||||
use signal_hook::iterator::SignalsInfo;
|
||||
use std::process::exit;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{ensure, Result};
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use pageserver::{branches, http, logger, page_service, tenant_mgr, PageServerConf};
|
||||
use pageserver::{
|
||||
branches, defaults::*, http, page_service, relish_storage, tenant_mgr, PageServerConf,
|
||||
RelishStorageConfig, RelishStorageKind, S3Config, LOG_FILE_NAME,
|
||||
};
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::postgres_backend;
|
||||
|
||||
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
|
||||
const DEFAULT_HTTP_ENDPOINT_ADDR: &str = "127.0.0.1:9898";
|
||||
|
||||
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
|
||||
|
||||
const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
use const_format::formatcp;
|
||||
|
||||
/// String arguments that can be declared via CLI or config file
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
struct CfgFileParams {
|
||||
listen_addr: Option<String>,
|
||||
http_endpoint_addr: Option<String>,
|
||||
listen_pg_addr: Option<String>,
|
||||
listen_http_addr: Option<String>,
|
||||
checkpoint_distance: Option<String>,
|
||||
checkpoint_period: Option<String>,
|
||||
upload_distance: Option<String>,
|
||||
upload_period: Option<String>,
|
||||
reconstruct_threshold: Option<String>,
|
||||
gc_horizon: Option<String>,
|
||||
gc_period: Option<String>,
|
||||
pg_distrib_dir: Option<String>,
|
||||
auth_validation_public_key_path: Option<String>,
|
||||
auth_type: Option<String>,
|
||||
relish_storage_max_concurrent_sync: Option<String>,
|
||||
/////////////////////////////////
|
||||
//// Don't put `Option<String>` and other "simple" values below.
|
||||
////
|
||||
/// `Option<RelishStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
|
||||
/// Values in TOML cannot be defined after tables (other tables can),
|
||||
/// and [`toml`] crate serializes all fields in the order of their appearance.
|
||||
////////////////////////////////
|
||||
relish_storage: Option<RelishStorage>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
|
||||
// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
|
||||
#[serde(untagged)]
|
||||
enum RelishStorage {
|
||||
Local {
|
||||
local_path: String,
|
||||
},
|
||||
AwsS3 {
|
||||
bucket_name: String,
|
||||
bucket_region: String,
|
||||
#[serde(skip_serializing)]
|
||||
access_key_id: Option<String>,
|
||||
#[serde(skip_serializing)]
|
||||
secret_access_key: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
impl CfgFileParams {
|
||||
@@ -49,14 +86,36 @@ impl CfgFileParams {
|
||||
arg_matches.value_of(arg_name).map(str::to_owned)
|
||||
};
|
||||
|
||||
let relish_storage = if let Some(local_path) = get_arg("relish-storage-local-path") {
|
||||
Some(RelishStorage::Local { local_path })
|
||||
} else if let Some((bucket_name, bucket_region)) =
|
||||
get_arg("relish-storage-s3-bucket").zip(get_arg("relish-storage-region"))
|
||||
{
|
||||
Some(RelishStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id: get_arg("relish-storage-access-key"),
|
||||
secret_access_key: get_arg("relish-storage-secret-access-key"),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Self {
|
||||
listen_addr: get_arg("listen"),
|
||||
http_endpoint_addr: get_arg("http_endpoint"),
|
||||
listen_pg_addr: get_arg("listen-pg"),
|
||||
listen_http_addr: get_arg("listen-http"),
|
||||
checkpoint_distance: get_arg("checkpoint_distance"),
|
||||
checkpoint_period: get_arg("checkpoint_period"),
|
||||
upload_distance: get_arg("upload_distance"),
|
||||
upload_period: get_arg("upload_period"),
|
||||
reconstruct_threshold: get_arg("reconstruct_threshold"),
|
||||
gc_horizon: get_arg("gc_horizon"),
|
||||
gc_period: get_arg("gc_period"),
|
||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
|
||||
auth_type: get_arg("auth-type"),
|
||||
relish_storage,
|
||||
relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,8 +123,13 @@ impl CfgFileParams {
|
||||
fn or(self, other: CfgFileParams) -> Self {
|
||||
// TODO cleaner way to do this
|
||||
Self {
|
||||
listen_addr: self.listen_addr.or(other.listen_addr),
|
||||
http_endpoint_addr: self.http_endpoint_addr.or(other.http_endpoint_addr),
|
||||
listen_pg_addr: self.listen_pg_addr.or(other.listen_pg_addr),
|
||||
listen_http_addr: self.listen_http_addr.or(other.listen_http_addr),
|
||||
checkpoint_distance: self.checkpoint_distance.or(other.checkpoint_distance),
|
||||
checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
|
||||
upload_distance: self.upload_distance.or(other.upload_distance),
|
||||
upload_period: self.upload_period.or(other.upload_period),
|
||||
reconstruct_threshold: self.reconstruct_threshold.or(other.reconstruct_threshold),
|
||||
gc_horizon: self.gc_horizon.or(other.gc_horizon),
|
||||
gc_period: self.gc_period.or(other.gc_period),
|
||||
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
|
||||
@@ -73,6 +137,10 @@ impl CfgFileParams {
|
||||
.auth_validation_public_key_path
|
||||
.or(other.auth_validation_public_key_path),
|
||||
auth_type: self.auth_type.or(other.auth_type),
|
||||
relish_storage: self.relish_storage.or(other.relish_storage),
|
||||
relish_storage_max_concurrent_sync: self
|
||||
.relish_storage_max_concurrent_sync
|
||||
.or(other.relish_storage_max_concurrent_sync),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,14 +148,37 @@ impl CfgFileParams {
|
||||
fn try_into_config(&self) -> Result<PageServerConf> {
|
||||
let workdir = PathBuf::from(".");
|
||||
|
||||
let listen_addr = match self.listen_addr.as_ref() {
|
||||
let listen_pg_addr = match self.listen_pg_addr.as_ref() {
|
||||
Some(addr) => addr.clone(),
|
||||
None => DEFAULT_LISTEN_ADDR.to_owned(),
|
||||
None => DEFAULT_PG_LISTEN_ADDR.to_owned(),
|
||||
};
|
||||
|
||||
let http_endpoint_addr = match self.http_endpoint_addr.as_ref() {
|
||||
let listen_http_addr = match self.listen_http_addr.as_ref() {
|
||||
Some(addr) => addr.clone(),
|
||||
None => DEFAULT_HTTP_ENDPOINT_ADDR.to_owned(),
|
||||
None => DEFAULT_HTTP_LISTEN_ADDR.to_owned(),
|
||||
};
|
||||
|
||||
let checkpoint_distance: u64 = match self.checkpoint_distance.as_ref() {
|
||||
Some(checkpoint_distance_str) => checkpoint_distance_str.parse()?,
|
||||
None => DEFAULT_CHECKPOINT_DISTANCE,
|
||||
};
|
||||
let checkpoint_period = match self.checkpoint_period.as_ref() {
|
||||
Some(checkpoint_period_str) => humantime::parse_duration(checkpoint_period_str)?,
|
||||
None => DEFAULT_CHECKPOINT_PERIOD,
|
||||
};
|
||||
|
||||
let upload_distance: u64 = match self.upload_distance.as_ref() {
|
||||
Some(upload_distance_str) => upload_distance_str.parse()?,
|
||||
None => DEFAULT_UPLOAD_DISTANCE,
|
||||
};
|
||||
let upload_period = match self.upload_period.as_ref() {
|
||||
Some(upload_period_str) => humantime::parse_duration(upload_period_str)?,
|
||||
None => DEFAULT_UPLOAD_PERIOD,
|
||||
};
|
||||
|
||||
let reconstruct_threshold: u64 = match self.reconstruct_threshold.as_ref() {
|
||||
Some(reconstruct_threshold_str) => reconstruct_threshold_str.parse()?,
|
||||
None => DEFAULT_RECONSTRUCT_THRESHOLD,
|
||||
};
|
||||
|
||||
let gc_horizon: u64 = match self.gc_horizon.as_ref() {
|
||||
@@ -117,7 +208,7 @@ impl CfgFileParams {
|
||||
})?;
|
||||
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
if auth_type == AuthType::ZenithJWT {
|
||||
@@ -132,11 +223,45 @@ impl CfgFileParams {
|
||||
);
|
||||
}
|
||||
|
||||
let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() {
|
||||
Some(relish_storage_max_concurrent_sync) => {
|
||||
relish_storage_max_concurrent_sync.parse()?
|
||||
}
|
||||
None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
|
||||
};
|
||||
let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| {
|
||||
let storage = match storage_params.clone() {
|
||||
RelishStorage::Local { local_path } => {
|
||||
RelishStorageKind::LocalFs(PathBuf::from(local_path))
|
||||
}
|
||||
RelishStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
} => RelishStorageKind::AwsS3(S3Config {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
}),
|
||||
};
|
||||
RelishStorageConfig {
|
||||
max_concurrent_sync,
|
||||
storage,
|
||||
}
|
||||
});
|
||||
|
||||
Ok(PageServerConf {
|
||||
daemonize: false,
|
||||
|
||||
listen_addr,
|
||||
http_endpoint_addr,
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
checkpoint_distance,
|
||||
checkpoint_period,
|
||||
upload_distance,
|
||||
upload_period,
|
||||
reconstruct_threshold,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
|
||||
@@ -148,19 +273,29 @@ impl CfgFileParams {
|
||||
|
||||
auth_validation_public_key_path,
|
||||
auth_type,
|
||||
relish_storage_config,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("pageserver");
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
Arg::with_name("listen-pg")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.long("listen-pg")
|
||||
.alias("listen") // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"),
|
||||
.help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen-http")
|
||||
.long("listen-http")
|
||||
.alias("http_endpoint") // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
@@ -175,6 +310,36 @@ fn main() -> Result<()> {
|
||||
.takes_value(false)
|
||||
.help("Initialize pageserver repo"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("checkpoint_distance")
|
||||
.long("checkpoint_distance")
|
||||
.takes_value(true)
|
||||
.help("Distance from current LSN to perform checkpoint of in-memory layers"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("checkpoint_period")
|
||||
.long("checkpoint_period")
|
||||
.takes_value(true)
|
||||
.help("Interval between checkpoint iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("checkpoint_distance")
|
||||
.long("checkpoint_distance")
|
||||
.takes_value(true)
|
||||
.help("Distance from current LSN to perform checkpoint of in-memory layers"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("upload_period")
|
||||
.long("upload_period")
|
||||
.takes_value(true)
|
||||
.help("Interval between upload iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("reconstruct_threshold")
|
||||
.long("reconstruct_threshold")
|
||||
.takes_value(true)
|
||||
.help("Minimal size of deltas after which page reconstruction (materialization) can be performed"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_horizon")
|
||||
.long("gc_horizon")
|
||||
@@ -219,10 +384,56 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-local-path")
|
||||
.long("relish-storage-local-path")
|
||||
.takes_value(true)
|
||||
.help("Path to the local directory, to be used as an external relish storage")
|
||||
.conflicts_with_all(&[
|
||||
"relish-storage-s3-bucket",
|
||||
"relish-storage-region",
|
||||
"relish-storage-access-key",
|
||||
"relish-storage-secret-access-key",
|
||||
]),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-s3-bucket")
|
||||
.long("relish-storage-s3-bucket")
|
||||
.takes_value(true)
|
||||
.help("Name of the AWS S3 bucket to use an external relish storage")
|
||||
.requires("relish-storage-region"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-region")
|
||||
.long("relish-storage-region")
|
||||
.takes_value(true)
|
||||
.help("Region of the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-access-key")
|
||||
.long("relish-storage-access-key")
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-secret-access-key")
|
||||
.long("relish-storage-secret-access-key")
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-max-concurrent-sync")
|
||||
.long("relish-storage-max-concurrent-sync")
|
||||
.takes_value(true)
|
||||
.help("Maximum allowed concurrent synchronisations with storage"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
let cfg_file_path = workdir.canonicalize()?.join("pageserver.toml");
|
||||
let cfg_file_path = workdir
|
||||
.canonicalize()
|
||||
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?
|
||||
.join("pageserver.toml");
|
||||
|
||||
let args_params = CfgFileParams::from_args(&arg_matches);
|
||||
|
||||
@@ -234,22 +445,37 @@ fn main() -> Result<()> {
|
||||
args_params
|
||||
} else {
|
||||
// Supplement the CLI arguments with the config file
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)?;
|
||||
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents)?;
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)
|
||||
.with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?;
|
||||
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents).with_context(|| {
|
||||
format!(
|
||||
"Failed to read '{}' as pageserver config",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
args_params.or(file_params)
|
||||
};
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
env::set_current_dir(&workdir)?;
|
||||
env::set_current_dir(&workdir).with_context(|| {
|
||||
format!(
|
||||
"Failed to set application's current dir to '{}'",
|
||||
workdir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Ensure the config is valid, even if just init-ing
|
||||
let mut conf = params.try_into_config()?;
|
||||
let mut conf = params.try_into_config().with_context(|| {
|
||||
format!(
|
||||
"Pageserver config at '{}' is not valid",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
conf.daemonize = arg_matches.is_present("daemonize");
|
||||
|
||||
if init && conf.daemonize {
|
||||
eprintln!("--daemonize cannot be used with --init");
|
||||
exit(1);
|
||||
bail!("--daemonize cannot be used with --init")
|
||||
}
|
||||
|
||||
// The configuration is all set up now. Turn it into a 'static
|
||||
@@ -259,28 +485,53 @@ fn main() -> Result<()> {
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if init {
|
||||
branches::init_pageserver(conf, create_tenant)?;
|
||||
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
|
||||
// write the config file
|
||||
let cfg_file_contents = toml::to_string_pretty(¶ms)?;
|
||||
let cfg_file_contents = toml::to_string_pretty(¶ms)
|
||||
.context("Failed to create pageserver config contents for initialisation")?;
|
||||
// TODO support enable-auth flag
|
||||
std::fs::write(&cfg_file_path, cfg_file_contents)?;
|
||||
|
||||
return Ok(());
|
||||
std::fs::write(&cfg_file_path, cfg_file_contents).with_context(|| {
|
||||
format!(
|
||||
"Failed to initialize pageserver config at '{}'",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
start_pageserver(conf).context("Failed to start pageserver")
|
||||
}
|
||||
|
||||
start_pageserver(conf)
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Initialize logger
|
||||
let (_scope_guard, log_file) = logger::init_logging(conf, "pageserver.log")?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
let term_now = Arc::new(AtomicBool::new(false));
|
||||
for sig in TERM_SIGNALS {
|
||||
// When terminated by a second term signal, exit with exit code 1.
|
||||
// This will do nothing the first time (because term_now is false).
|
||||
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
|
||||
// But this will "arm" the above for the second time, by setting it to true.
|
||||
// The order of registering these is important, if you put this one first, it will
|
||||
// first arm and then terminate ‒ all in the first round.
|
||||
flag::register(*sig, Arc::clone(&term_now))?;
|
||||
}
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
// bind sockets before daemonizing so we report errors early and do not return until we are listening
|
||||
info!(
|
||||
"Starting pageserver http handler on {}",
|
||||
conf.listen_http_addr
|
||||
);
|
||||
let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;
|
||||
|
||||
info!(
|
||||
"Starting pageserver pg protocol handler on {}",
|
||||
conf.listen_pg_addr
|
||||
);
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;
|
||||
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
@@ -297,10 +548,20 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
|
||||
match daemonize.start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(e) => error!("Error, {}", e),
|
||||
Err(err) => error!(%err, "could not daemonize"),
|
||||
}
|
||||
}
|
||||
|
||||
// keep join handles for spawned threads
|
||||
// don't spawn threads before daemonizing
|
||||
let mut join_handles = Vec::new();
|
||||
|
||||
if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? {
|
||||
join_handles.push(handle);
|
||||
}
|
||||
// Initialize tenant manager.
|
||||
tenant_mgr::init(conf);
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust | AuthType::MD5 => None,
|
||||
@@ -313,21 +574,16 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
// Spawn a new thread for the http endpoint
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
let cloned = auth.clone();
|
||||
thread::Builder::new()
|
||||
let http_endpoint_thread = thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(move || {
|
||||
let router = http::make_router(conf, cloned);
|
||||
endpoint::serve_thread_main(router, conf.http_endpoint_addr.clone())
|
||||
endpoint::serve_thread_main(router, http_listener)
|
||||
})?;
|
||||
|
||||
// Check that we can bind to address before starting threads to simplify shutdown
|
||||
// sequence if port is occupied.
|
||||
info!("Starting pageserver on {}", conf.listen_addr);
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_addr.clone())?;
|
||||
|
||||
// Initialize tenant manager.
|
||||
tenant_mgr::init(conf);
|
||||
join_handles.push(http_endpoint_thread);
|
||||
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
@@ -337,9 +593,185 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?;
|
||||
|
||||
page_service_thread
|
||||
.join()
|
||||
.expect("Page service thread has panicked")?;
|
||||
for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
|
||||
match info.signal {
|
||||
SIGQUIT => {
|
||||
info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
|
||||
exit(111);
|
||||
}
|
||||
SIGINT | SIGTERM => {
|
||||
info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
|
||||
// Terminate postgres backends
|
||||
postgres_backend::set_pgbackend_shutdown_requested();
|
||||
// Stop all tenants and flush their data
|
||||
tenant_mgr::shutdown_all_tenants()?;
|
||||
// Wait for pageservice thread to complete the job
|
||||
page_service_thread
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
|
||||
// Shut down http router
|
||||
endpoint::shutdown();
|
||||
|
||||
// Wait for all threads
|
||||
for handle in join_handles.into_iter() {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
}
|
||||
info!("Pageserver shut down successfully completed");
|
||||
exit(0);
|
||||
}
|
||||
unknown_signal => {
|
||||
debug!("Unknown signal {}", unknown_signal);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn page_server_conf_toml_serde() {
|
||||
let params = CfgFileParams {
|
||||
listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
|
||||
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
|
||||
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
upload_distance: Some("upload_distance_VALUE".to_string()),
|
||||
upload_period: Some("upload_period_VALUE".to_string()),
|
||||
reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
),
|
||||
auth_type: Some("auth_type_VALUE".to_string()),
|
||||
relish_storage: Some(RelishStorage::Local {
|
||||
local_path: "relish_storage_local_VALUE".to_string(),
|
||||
}),
|
||||
relish_storage_max_concurrent_sync: Some(
|
||||
"relish_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
let toml_string = toml::to_string(¶ms).expect("Failed to serialize correct config");
|
||||
let toml_pretty_string =
|
||||
toml::to_string_pretty(¶ms).expect("Failed to serialize correct config");
|
||||
assert_eq!(
|
||||
r#"listen_pg_addr = 'listen_pg_addr_VALUE'
|
||||
listen_http_addr = 'listen_http_addr_VALUE'
|
||||
checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
upload_distance = 'upload_distance_VALUE'
|
||||
upload_period = 'upload_period_VALUE'
|
||||
reconstruct_threshold = 'reconstruct_threshold_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
|
||||
|
||||
[relish_storage]
|
||||
local_path = 'relish_storage_local_VALUE'
|
||||
"#,
|
||||
toml_pretty_string
|
||||
);
|
||||
|
||||
let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
|
||||
.expect("Failed to deserialize the serialization result of the config");
|
||||
let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
|
||||
.expect("Failed to deserialize the prettified serialization result of the config");
|
||||
assert!(
|
||||
params_from_serialized == params,
|
||||
"Expected the same config in the end of config -> serialize -> deserialize chain"
|
||||
);
|
||||
assert!(
|
||||
params_from_serialized_pretty == params,
|
||||
"Expected the same config in the end of config -> serialize pretty -> deserialize chain"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn credentials_omitted_during_serialization() {
|
||||
let params = CfgFileParams {
|
||||
listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
|
||||
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
|
||||
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
upload_distance: Some("upload_distance_VALUE".to_string()),
|
||||
upload_period: Some("upload_period_VALUE".to_string()),
|
||||
reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
),
|
||||
auth_type: Some("auth_type_VALUE".to_string()),
|
||||
relish_storage: Some(RelishStorage::AwsS3 {
|
||||
bucket_name: "bucket_name_VALUE".to_string(),
|
||||
bucket_region: "bucket_region_VALUE".to_string(),
|
||||
access_key_id: Some("access_key_id_VALUE".to_string()),
|
||||
secret_access_key: Some("secret_access_key_VALUE".to_string()),
|
||||
}),
|
||||
relish_storage_max_concurrent_sync: Some(
|
||||
"relish_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
let toml_string = toml::to_string(¶ms).expect("Failed to serialize correct config");
|
||||
let toml_pretty_string =
|
||||
toml::to_string_pretty(¶ms).expect("Failed to serialize correct config");
|
||||
assert_eq!(
|
||||
r#"listen_pg_addr = 'listen_pg_addr_VALUE'
|
||||
listen_http_addr = 'listen_http_addr_VALUE'
|
||||
checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
upload_distance = 'upload_distance_VALUE'
|
||||
upload_period = 'upload_period_VALUE'
|
||||
reconstruct_threshold = 'reconstruct_threshold_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
|
||||
|
||||
[relish_storage]
|
||||
bucket_name = 'bucket_name_VALUE'
|
||||
bucket_region = 'bucket_region_VALUE'
|
||||
"#,
|
||||
toml_pretty_string
|
||||
);
|
||||
|
||||
let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
|
||||
.expect("Failed to deserialize the serialization result of the config");
|
||||
let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
|
||||
.expect("Failed to deserialize the prettified serialization result of the config");
|
||||
|
||||
let mut expected_params = params;
|
||||
expected_params.relish_storage = Some(RelishStorage::AwsS3 {
|
||||
bucket_name: "bucket_name_VALUE".to_string(),
|
||||
bucket_region: "bucket_region_VALUE".to_string(),
|
||||
access_key_id: None,
|
||||
secret_access_key: None,
|
||||
});
|
||||
assert!(
|
||||
params_from_serialized == expected_params,
|
||||
"Expected the config without credentials in the end of a 'config -> serialize -> deserialize' chain"
|
||||
);
|
||||
assert!(
|
||||
params_from_serialized_pretty == expected_params,
|
||||
"Expected the config without credentials in the end of a 'config -> serialize pretty -> deserialize' chain"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,25 +14,81 @@ use std::{
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use zenith_utils::crashsafe_dir;
|
||||
use zenith_utils::logging;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use log::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::logger;
|
||||
use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::{repository::Repository, PageServerConf};
|
||||
use crate::{restore_local_repo, LOG_FILE_NAME};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct BranchInfo {
|
||||
pub name: String,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub latest_valid_lsn: Option<Lsn>,
|
||||
pub latest_valid_lsn: Lsn,
|
||||
pub ancestor_id: Option<String>,
|
||||
pub ancestor_lsn: Option<String>,
|
||||
pub current_logical_size: usize,
|
||||
pub current_logical_size_non_incremental: usize,
|
||||
}
|
||||
|
||||
impl BranchInfo {
|
||||
pub fn from_path<T: AsRef<Path>>(
|
||||
path: T,
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
repo: &Arc<dyn Repository>,
|
||||
) -> Result<Self> {
|
||||
let name = path
|
||||
.as_ref()
|
||||
.file_name()
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
let timeline_id = std::fs::read_to_string(path)?.parse::<ZTimelineId>()?;
|
||||
|
||||
let timeline = repo.get_timeline(timeline_id)?;
|
||||
|
||||
let ancestor_path = conf.ancestor_path(&timeline_id, tenantid);
|
||||
let mut ancestor_id: Option<String> = None;
|
||||
let mut ancestor_lsn: Option<String> = None;
|
||||
|
||||
if ancestor_path.exists() {
|
||||
let ancestor = std::fs::read_to_string(ancestor_path)?;
|
||||
let mut strings = ancestor.split('@');
|
||||
|
||||
ancestor_id = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
ancestor_lsn = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
timeline_id,
|
||||
latest_valid_lsn: timeline.get_last_record_lsn(),
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
current_logical_size: timeline.get_current_logical_size(),
|
||||
current_logical_size_non_incremental: timeline
|
||||
.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -43,8 +99,8 @@ pub struct PointInTime {
|
||||
|
||||
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
|
||||
// Initialize logger
|
||||
let (_scope_guard, _log_file) = logger::init_logging(conf, "pageserver.log")?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
// use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
|
||||
let _log_file = logging::init(LOG_FILE_NAME, true)?;
|
||||
|
||||
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
|
||||
// process during repository initialization.
|
||||
@@ -63,7 +119,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
|
||||
println!("initializing tenantid {}", tenantid);
|
||||
create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
|
||||
}
|
||||
fs::create_dir_all(conf.tenants_path())?;
|
||||
crashsafe_dir::create_dir_all(conf.tenants_path())?;
|
||||
|
||||
println!("pageserver init succeeded");
|
||||
Ok(())
|
||||
@@ -80,30 +136,28 @@ pub fn create_repo(
|
||||
}
|
||||
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
fs::create_dir_all(&repo_dir)
|
||||
crashsafe_dir::create_dir_all(&repo_dir)
|
||||
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
|
||||
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
|
||||
fs::create_dir(conf.timelines_path(&tenantid))?;
|
||||
fs::create_dir_all(conf.branches_path(&tenantid))?;
|
||||
fs::create_dir_all(conf.tags_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
|
||||
|
||||
info!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
let tli = create_timeline(conf, None, &tenantid)?;
|
||||
|
||||
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
|
||||
let repo = Arc::new(crate::buffered_repository::BufferedRepository::new(
|
||||
conf,
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
false,
|
||||
));
|
||||
|
||||
// Load data into pageserver
|
||||
// TODO To implement zenith import we need to
|
||||
// move data loading out of create_repo()
|
||||
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
|
||||
bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?;
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
@@ -118,17 +172,20 @@ fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
|
||||
Ok(Lsn(lsn))
|
||||
}
|
||||
|
||||
// Create the cluster temporarily in a initdbpath directory inside the repository
|
||||
// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
// to get bootstrap data for timeline initialization.
|
||||
//
|
||||
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
info!("running initdb... ");
|
||||
info!("running initdb in {}... ", initdbpath.display());
|
||||
|
||||
let initdb_path = conf.pg_bin_dir().join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", initdbpath.to_str().unwrap()])
|
||||
.args(&["-U", &conf.superuser])
|
||||
.arg("--no-instructions")
|
||||
// This is only used for a temporary installation that is deleted shortly after,
|
||||
// so no need to fsync it
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
@@ -141,7 +198,6 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
String::from_utf8_lossy(&initdb_output.stderr)
|
||||
);
|
||||
}
|
||||
info!("initdb succeeded");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -156,6 +212,8 @@ fn bootstrap_timeline(
|
||||
tli: ZTimelineId,
|
||||
repo: &dyn Repository,
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
|
||||
|
||||
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
|
||||
|
||||
// Init temporarily repo to get bootstrap data
|
||||
@@ -164,13 +222,15 @@ fn bootstrap_timeline(
|
||||
|
||||
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
|
||||
info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn);
|
||||
|
||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
|
||||
|
||||
let wal_dir = pgdata_path.join("pg_wal");
|
||||
restore_local_repo::import_timeline_wal(&wal_dir, &*timeline, timeline.get_last_record_lsn())?;
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
let timeline = repo.create_empty_timeline(tli)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
timeline.writer().as_ref(),
|
||||
lsn,
|
||||
)?;
|
||||
timeline.checkpoint()?;
|
||||
|
||||
println!(
|
||||
"created initial timeline {} timeline.lsn {}",
|
||||
@@ -201,7 +261,7 @@ pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
|
||||
}
|
||||
|
||||
pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
|
||||
|
||||
// Each branch has a corresponding record (text file) in the refs/branches
|
||||
// with timeline_id.
|
||||
@@ -210,43 +270,7 @@ pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Resul
|
||||
std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
let name = dir_entry.file_name().to_str().unwrap().to_string();
|
||||
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
|
||||
|
||||
let latest_valid_lsn = repo
|
||||
.get_timeline(timeline_id)
|
||||
.map(|timeline| timeline.get_last_record_lsn())
|
||||
.ok();
|
||||
|
||||
let ancestor_path = conf.ancestor_path(&timeline_id, tenantid);
|
||||
let mut ancestor_id: Option<String> = None;
|
||||
let mut ancestor_lsn: Option<String> = None;
|
||||
|
||||
if ancestor_path.exists() {
|
||||
let ancestor = std::fs::read_to_string(ancestor_path)?;
|
||||
let mut strings = ancestor.split('@');
|
||||
|
||||
ancestor_id = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
ancestor_lsn = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
timeline_id,
|
||||
latest_valid_lsn,
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
})
|
||||
BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -257,7 +281,7 @@ pub(crate) fn create_branch(
|
||||
startpoint_str: &str,
|
||||
tenantid: &ZTenantId,
|
||||
) -> Result<BranchInfo> {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
|
||||
|
||||
if conf.branch_path(branchname, tenantid).exists() {
|
||||
anyhow::bail!("branch {} already exists", branchname);
|
||||
@@ -270,6 +294,14 @@ pub(crate) fn create_branch(
|
||||
let end_of_wal = timeline.get_last_record_lsn();
|
||||
info!("branching at end of WAL: {}", end_of_wal);
|
||||
startpoint.lsn = end_of_wal;
|
||||
} else {
|
||||
// Wait for the WAL to arrive and be processed on the parent branch up
|
||||
// to the requested branch point. The repository code itself doesn't
|
||||
// require it, but if we start to receive WAL on the new timeline,
|
||||
// decoding the new WAL might need to look up previous pages, relation
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
timeline.wait_lsn(startpoint.lsn)?;
|
||||
}
|
||||
startpoint.lsn = startpoint.lsn.align();
|
||||
if timeline.get_start_lsn() > startpoint.lsn {
|
||||
@@ -296,9 +328,11 @@ pub(crate) fn create_branch(
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
timeline_id: newtli,
|
||||
latest_valid_lsn: Some(startpoint.lsn),
|
||||
latest_valid_lsn: startpoint.lsn,
|
||||
ancestor_id: None,
|
||||
ancestor_lsn: None,
|
||||
current_logical_size: 0,
|
||||
current_logical_size_non_incremental: 0,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -387,7 +421,6 @@ fn create_timeline(
|
||||
let timelinedir = conf.timeline_path(&timelineid, tenantid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
|
||||
2129
pageserver/src/buffered_repository.rs
Normal file
2129
pageserver/src/buffered_repository.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -54,7 +54,52 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/branch/{tenant_id}/{branch_name}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: branch_name
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
get:
|
||||
description: Get branches for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: BranchInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/BranchInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no branch name
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
@@ -203,6 +248,9 @@ components:
|
||||
required:
|
||||
- name
|
||||
- timeline_id
|
||||
- latest_valid_lsn
|
||||
- current_logical_size
|
||||
- current_logical_size_non_incremental
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
@@ -213,6 +261,10 @@ components:
|
||||
type: string
|
||||
ancestor_lsn:
|
||||
type: string
|
||||
current_logical_size:
|
||||
type: integer
|
||||
current_logical_size_non_incremental:
|
||||
type: integer
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -5,6 +5,7 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use routerify::{ext::RequestExt, RouterBuilder};
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
use zenith_utils::http::endpoint::auth_middleware;
|
||||
@@ -14,14 +15,14 @@ use zenith_utils::http::{
|
||||
endpoint,
|
||||
error::HttpErrorBody,
|
||||
json::{json_request, json_response},
|
||||
request::get_request_param,
|
||||
request::parse_request_param,
|
||||
};
|
||||
|
||||
use super::models::BranchCreateRequest;
|
||||
use super::models::TenantCreateRequest;
|
||||
use crate::{
|
||||
branches::{self},
|
||||
tenant_mgr, PageServerConf, ZTenantId,
|
||||
};
|
||||
use crate::branches::BranchInfo;
|
||||
use crate::{branches, tenant_mgr, PageServerConf, ZTenantId};
|
||||
|
||||
#[derive(Debug)]
|
||||
struct State {
|
||||
@@ -72,6 +73,7 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, Some(request_data.tenant_id))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
|
||||
branches::create_branch(
|
||||
get_config(&request),
|
||||
&request_data.name,
|
||||
@@ -85,20 +87,12 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
}
|
||||
|
||||
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = match request.param("tenant_id") {
|
||||
Some(arg) => arg
|
||||
.parse()
|
||||
.map_err(|_| ApiError::BadRequest("failed to parse tenant id".to_string()))?,
|
||||
None => {
|
||||
return Err(ApiError::BadRequest(
|
||||
"no tenant id specified in path param".to_string(),
|
||||
))
|
||||
}
|
||||
};
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
|
||||
check_permission(&request, Some(tenantid))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
|
||||
crate::branches::get_branches(get_config(&request), &tenantid)
|
||||
})
|
||||
.await
|
||||
@@ -106,14 +100,35 @@ async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
// TODO add to swagger
|
||||
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
|
||||
let conf = get_state(&request).conf;
|
||||
let path = conf.branch_path(&branch_name, &tenantid);
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
BranchInfo::from_path(path, conf, &tenantid, &repo)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let response_data =
|
||||
tokio::task::spawn_blocking(move || crate::branches::get_tenants(get_config(&request)))
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
crate::branches::get_tenants(get_config(&request))
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
@@ -124,6 +139,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
|
||||
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
|
||||
})
|
||||
.await
|
||||
@@ -159,6 +175,7 @@ pub fn make_router(
|
||||
.data(Arc::new(State::new(conf, auth)))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/branch/:tenant_id", branch_list_handler)
|
||||
.get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
|
||||
.post("/v1/branch", branch_create_handler)
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,60 @@ The on-disk format is based on immutable files. The page server
|
||||
receives a stream of incoming WAL, parses the WAL records to determine
|
||||
which pages they apply to, and accumulates the incoming changes in
|
||||
memory. Every now and then, the accumulated changes are written out to
|
||||
new files.
|
||||
new immutable files. This process is called checkpointing. Old versions
|
||||
of on-disk files that are not needed by any timeline are removed by GC
|
||||
process.
|
||||
|
||||
# Terms used in layered repository
|
||||
|
||||
- Relish - one PostgreSQL relation or similarly treated file.
|
||||
- Segment - one slice of a Relish that is stored in a LayeredTimeline.
|
||||
- Layer - specific version of a relish Segment in a range of LSNs.
|
||||
|
||||
Layers can be InMemory or OnDisk:
|
||||
- InMemory layer is not durably stored and needs to rebuild from WAL on pageserver start.
|
||||
- OnDisk layer is durably stored.
|
||||
|
||||
OnDisk layers can be Image or Delta:
|
||||
- ImageLayer represents an image or a snapshot of a segment at one particular LSN.
|
||||
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs.
|
||||
|
||||
Dropped segments are always represented on disk by DeltaLayer.
|
||||
|
||||
LSN range defined by start_lsn and end_lsn:
|
||||
- start_lsn is inclusive.
|
||||
- end_lsn is exclusive.
|
||||
|
||||
For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen
|
||||
in-memory layer or a delta layer, it is a valid end bound. An image
|
||||
layer represents snapshot at one LSN, so end_lsn is always the
|
||||
snapshot LSN + 1
|
||||
|
||||
Layers can be open or historical:
|
||||
- Open layer is a writeable one. Only InMemory layer can be open.
|
||||
FIXME: If open layer is dropped, it is not writeable, so it should be turned into historical,
|
||||
but now it is not implemented - see bug #569.
|
||||
- Historical layer is the one that cannot be modified anymore. Now only OnDisk layers can be historical.
|
||||
|
||||
- LayerMap - a map that tracks what layers exist for all the relishes in a timeline.
|
||||
|
||||
LayerMap consists of two data structures:
|
||||
- segs - All the layers keyed by segment tag
|
||||
- open_layers - data structure that hold all open layers ordered by oldest_pending_lsn for quick access during checkpointing. oldest_pending_lsn is the LSN of the oldest page version stored in this layer.
|
||||
|
||||
All operations that update InMemory Layers should update both structures to keep them up-to-date.
|
||||
|
||||
- LayeredTimeline - implements Timeline interface.
|
||||
|
||||
All methods of LayeredTimeline are aware of its ancestors and return data taking them into account.
|
||||
TODO: Are there any exceptions to this?
|
||||
For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN,
|
||||
including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap.
|
||||
|
||||
TODO:
|
||||
Describe GC and checkpoint interval settings.
|
||||
|
||||
# Layer files (On-disk layers)
|
||||
|
||||
The files are called "layer files". Each layer file corresponds
|
||||
to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or
|
||||
|
||||
@@ -16,14 +16,14 @@ pub fn read_blob(reader: &BoundedReader<&'_ File>, range: &BlobRange) -> Result<
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub struct BlobWriter {
|
||||
writer: ChapterWriter<File>,
|
||||
pub struct BlobWriter<W> {
|
||||
writer: ChapterWriter<W>,
|
||||
offset: u64,
|
||||
}
|
||||
|
||||
impl BlobWriter {
|
||||
impl<W: Write> BlobWriter<W> {
|
||||
// This function takes a BookWriter and creates a new chapter to ensure offset is 0.
|
||||
pub fn new(book_writer: BookWriter<File>, chapter_id: impl Into<ChapterId>) -> Self {
|
||||
pub fn new(book_writer: BookWriter<W>, chapter_id: impl Into<ChapterId>) -> Self {
|
||||
let writer = book_writer.new_chapter(chapter_id);
|
||||
Self { writer, offset: 0 }
|
||||
}
|
||||
@@ -39,7 +39,7 @@ impl BlobWriter {
|
||||
Ok(range)
|
||||
}
|
||||
|
||||
pub fn close(self) -> bookfile::Result<BookWriter<File>> {
|
||||
pub fn close(self) -> bookfile::Result<BookWriter<W>> {
|
||||
self.writer.close()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,27 +39,24 @@
|
||||
//!
|
||||
use crate::layered_repository::blob::BlobWriter;
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
|
||||
};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
|
||||
use crate::repository::{PageReconstructData, PageReconstructResult, PageVersion};
|
||||
use crate::waldecoder;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
@@ -69,7 +66,7 @@ use zenith_utils::lsn::Lsn;
|
||||
use super::blob::{read_blob, BlobRange};
|
||||
|
||||
// Magic constant to identify a Zenith delta file
|
||||
static DELTA_FILE_MAGIC: u32 = 0x5A616E01;
|
||||
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;
|
||||
|
||||
/// Mapping from (block #, lsn) -> page/WAL record
|
||||
/// byte ranges in PAGE_VERSIONS_CHAPTER
|
||||
@@ -79,10 +76,34 @@ static PAGE_VERSION_METAS_CHAPTER: u64 = 1;
|
||||
static PAGE_VERSIONS_CHAPTER: u64 = 2;
|
||||
static REL_SIZES_CHAPTER: u64 = 3;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PageVersionMeta {
|
||||
page_image_range: Option<BlobRange>,
|
||||
record_range: Option<BlobRange>,
|
||||
/// Contains the [`Summary`] struct
|
||||
static SUMMARY_CHAPTER: u64 = 4;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct Summary {
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
|
||||
dropped: bool,
|
||||
}
|
||||
|
||||
impl From<&DeltaLayer> for Summary {
|
||||
fn from(layer: &DeltaLayer) -> Self {
|
||||
Self {
|
||||
tenantid: layer.tenantid,
|
||||
timelineid: layer.timelineid,
|
||||
seg: layer.seg,
|
||||
|
||||
start_lsn: layer.start_lsn,
|
||||
end_lsn: layer.end_lsn,
|
||||
|
||||
dropped: layer.dropped,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -109,9 +130,6 @@ pub struct DeltaLayer {
|
||||
|
||||
dropped: bool,
|
||||
|
||||
/// Predecessor layer
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
|
||||
inner: Mutex<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
@@ -122,13 +140,17 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: BTreeMap<(u32, Lsn), PageVersionMeta>,
|
||||
page_version_metas: VecMap<(u32, Lsn), BlobRange>,
|
||||
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
@@ -150,15 +172,7 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
}
|
||||
.to_string(),
|
||||
)
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
@@ -168,7 +182,7 @@ impl Layer for DeltaLayer {
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
let mut cont_lsn: Option<Lsn> = Some(lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
@@ -182,31 +196,30 @@ impl Layer for DeltaLayer {
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let mut iter = inner
|
||||
let iter = inner
|
||||
.page_version_metas
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
|
||||
if let Some(img_range) = &entry.page_image_range {
|
||||
// Found a page image, return it
|
||||
let img = Bytes::from(read_blob(&page_version_reader, img_range)?);
|
||||
reconstruct_data.page_img = Some(img);
|
||||
cont_lsn = None;
|
||||
break;
|
||||
} else if let Some(rec_range) = &entry.record_range {
|
||||
let rec = WALRecord::des(&read_blob(&page_version_reader, rec_range)?)?;
|
||||
let will_init = rec.will_init;
|
||||
reconstruct_data.records.push(rec);
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
cont_lsn = None;
|
||||
.slice_range((Included(&minkey), Included(&maxkey)))
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, pv_lsn), blob_range) in iter {
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
// Found a page image, return it
|
||||
reconstruct_data.page_img = Some(img);
|
||||
need_image = false;
|
||||
break;
|
||||
} else {
|
||||
// This WAL record needs to be applied against an older page image
|
||||
cont_lsn = Some(*entry_lsn);
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
PageVersion::Wal(rec) => {
|
||||
let will_init = rec.will_init;
|
||||
reconstruct_data.records.push((*pv_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -214,16 +227,9 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know about the predecessor layer.
|
||||
if let Some(cont_lsn) = cont_lsn {
|
||||
if let Some(cont_layer) = &self.predecessor {
|
||||
Ok(PageReconstructResult::Continue(
|
||||
cont_lsn,
|
||||
Arc::clone(cont_layer),
|
||||
))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(cont_lsn))
|
||||
}
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
@@ -232,21 +238,22 @@ impl Layer for DeltaLayer {
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.load()?;
|
||||
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
let slice = inner
|
||||
.relsizes
|
||||
.slice_range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
let result;
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
result = *entry;
|
||||
// Use the base image if needed
|
||||
} else if let Some(predecessor) = &self.predecessor {
|
||||
result = predecessor.get_seg_size(lsn)?;
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
} else {
|
||||
result = 0;
|
||||
Err(anyhow::anyhow!("could not find seg size in delta layer"))
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
@@ -266,8 +273,8 @@ impl Layer for DeltaLayer {
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.page_version_metas = BTreeMap::new();
|
||||
inner.relsizes = BTreeMap::new();
|
||||
inner.page_version_metas = VecMap::default();
|
||||
inner.relsizes = VecMap::default();
|
||||
inner.loaded = false;
|
||||
Ok(())
|
||||
}
|
||||
@@ -285,38 +292,41 @@ impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for tli {} seg {} {}-{} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, self.end_lsn
|
||||
"----- delta layer for ten {} tli {} seg {} {}-{} ----",
|
||||
self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn
|
||||
);
|
||||
|
||||
println!("--- relsizes ---");
|
||||
let inner = self.load()?;
|
||||
for (k, v) in inner.relsizes.iter() {
|
||||
for (k, v) in inner.relsizes.as_slice() {
|
||||
println!(" {}: {}", k, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for (k, v) in inner.page_version_metas.iter() {
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let mut desc = String::new();
|
||||
|
||||
if let Some(page_image_range) = v.page_image_range.as_ref() {
|
||||
let image = read_blob(&chapter, &page_image_range)?;
|
||||
write!(&mut desc, " img {} bytes", image.len())?;
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
rec.rec.len(),
|
||||
rec.will_init,
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
}
|
||||
if let Some(record_range) = v.record_range.as_ref() {
|
||||
let record_bytes = read_blob(&chapter, record_range)?;
|
||||
let rec = WALRecord::des(&record_bytes)?;
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
rec.rec.len(),
|
||||
rec.will_init,
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
println!(" blk {} at {}: {}", k.0, k.1, desc);
|
||||
|
||||
println!(" blk {} at {}: {}", blk, lsn, desc);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -324,18 +334,17 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
)
|
||||
/// debugging function to print out the contents of the layer
|
||||
pub fn versions(&self) -> Result<Vec<(u32, Lsn, PageVersion)>> {
|
||||
let mut versions: Vec<(u32, Lsn, PageVersion)> = Vec::new();
|
||||
let inner = self.load()?;
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
versions.push((*blk, *lsn, PageVersion::des(&buf)?));
|
||||
}
|
||||
Ok(versions)
|
||||
}
|
||||
|
||||
fn path_for(
|
||||
@@ -352,14 +361,15 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new delta file, using the given btreemaps containing the page versions and
|
||||
/// relsizes.
|
||||
/// Create a new delta file, using the given page versions and relsizes.
|
||||
/// The page versions are passed by an iterator; the iterator must return
|
||||
/// page versions in blknum+lsn order.
|
||||
///
|
||||
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create(
|
||||
pub fn create<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
@@ -367,10 +377,14 @@ impl DeltaLayer {
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
nosync: bool,
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
assert!(!relsizes.is_empty());
|
||||
}
|
||||
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -381,10 +395,9 @@ impl DeltaLayer {
|
||||
dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: true,
|
||||
page_version_metas: BTreeMap::new(),
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes,
|
||||
}),
|
||||
predecessor,
|
||||
};
|
||||
let mut inner = delta_layer.inner.lock().unwrap();
|
||||
|
||||
@@ -394,51 +407,54 @@ impl DeltaLayer {
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let file = File::create(&path)?;
|
||||
let book = BookWriter::new(file, DELTA_FILE_MAGIC)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
|
||||
|
||||
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
for (key, page_version) in page_versions {
|
||||
let page_image_range = page_version
|
||||
.page_image
|
||||
.map(|page_image| page_version_writer.write_blob(page_image.as_ref()))
|
||||
.transpose()?;
|
||||
for (blknum, lsn, page_version) in page_versions {
|
||||
let buf = PageVersion::ser(page_version)?;
|
||||
let blob_range = page_version_writer.write_blob(&buf)?;
|
||||
|
||||
let record_range = page_version
|
||||
.record
|
||||
.map(|record| {
|
||||
let buf = WALRecord::ser(&record)?;
|
||||
page_version_writer.write_blob(&buf)
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
let old = inner.page_version_metas.insert(
|
||||
key,
|
||||
PageVersionMeta {
|
||||
page_image_range,
|
||||
record_range,
|
||||
},
|
||||
);
|
||||
|
||||
assert!(old.is_none());
|
||||
inner
|
||||
.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let book = page_version_writer.close()?;
|
||||
|
||||
// Write out page versions
|
||||
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
|
||||
let buf = BTreeMap::ser(&inner.page_version_metas)?;
|
||||
let buf = VecMap::ser(&inner.page_version_metas)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// and relsizes to separate chapter
|
||||
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
|
||||
let buf = BTreeMap::ser(&inner.relsizes)?;
|
||||
let buf = VecMap::ser(&inner.relsizes)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
book.close()?;
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid,
|
||||
timelineid,
|
||||
seg,
|
||||
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
|
||||
dropped,
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
if !nosync {
|
||||
writer.get_ref().sync_all()?;
|
||||
}
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
drop(inner);
|
||||
@@ -447,17 +463,7 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
);
|
||||
let path = self.path();
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
@@ -478,11 +484,36 @@ impl DeltaLayer {
|
||||
|
||||
let (path, book) = self.open_book()?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let actual_summary = Summary::des(&chapter)?;
|
||||
|
||||
let expected_summary = Summary::from(self);
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = Path::new(path.file_name().unwrap());
|
||||
let expected_filename = self.filename();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
"warning: filename does not match what is expected from in-file summary"
|
||||
);
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
|
||||
let page_version_metas = BTreeMap::des(&chapter)?;
|
||||
let page_version_metas = VecMap::des(&chapter)?;
|
||||
|
||||
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
|
||||
let relsizes = BTreeMap::des(&chapter)?;
|
||||
let relsizes = VecMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
@@ -501,7 +532,6 @@ impl DeltaLayer {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
filename: &DeltaFileName,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
@@ -513,36 +543,51 @@ impl DeltaLayer {
|
||||
dropped: filename.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
}),
|
||||
predecessor,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path(
|
||||
path: &Path,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
filename: &DeltaFileName,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<Self> {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
Ok(DeltaLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
start_lsn: filename.start_lsn,
|
||||
end_lsn: filename.end_lsn,
|
||||
dropped: filename.dropped,
|
||||
timelineid: summary.timelineid,
|
||||
tenantid: summary.tenantid,
|
||||
seg: summary.seg,
|
||||
start_lsn: summary.start_lsn,
|
||||
end_lsn: summary.end_lsn,
|
||||
dropped: summary.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
}),
|
||||
predecessor: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
}
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ impl DeltaFileName {
|
||||
/// Parse a string as a delta file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn from_str(fname: &str) -> Option<Self> {
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
@@ -168,7 +168,7 @@ impl ImageFileName {
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn from_str(fname: &str) -> Option<Self> {
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
@@ -286,11 +286,11 @@ pub fn list_files(
|
||||
let fname = direntry?.file_name();
|
||||
let fname = fname.to_str().unwrap();
|
||||
|
||||
if let Some(deltafilename) = DeltaFileName::from_str(fname) {
|
||||
if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
|
||||
deltafiles.push(deltafilename);
|
||||
} else if let Some(imgfilename) = ImageFileName::from_str(fname) {
|
||||
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
||||
imgfiles.push(imgfilename);
|
||||
} else if fname == "wal" || fname == "metadata" || fname == "ancestor" {
|
||||
} else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
|
||||
@@ -22,34 +22,57 @@
|
||||
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
|
||||
//!
|
||||
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, SegmentTag,
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::RELISH_SEG_SIZE;
|
||||
use crate::layered_repository::storage_layer::{Layer, SegmentTag, RELISH_SEG_SIZE};
|
||||
use crate::repository::{PageReconstructData, PageReconstructResult};
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::convert::TryInto;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Magic constant to identify a Zenith segment image file
|
||||
const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;
|
||||
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;
|
||||
|
||||
/// Contains each block in block # order
|
||||
const BLOCKY_IMAGES_CHAPTER: u64 = 1;
|
||||
const NONBLOCKY_IMAGE_CHAPTER: u64 = 2;
|
||||
|
||||
/// Contains the [`Summary`] struct
|
||||
const SUMMARY_CHAPTER: u64 = 3;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct Summary {
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl From<&ImageLayer> for Summary {
|
||||
fn from(layer: &ImageLayer) -> Self {
|
||||
Self {
|
||||
tenantid: layer.tenantid,
|
||||
timelineid: layer.timelineid,
|
||||
seg: layer.seg,
|
||||
|
||||
lsn: layer.lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const BLOCK_SIZE: usize = 8192;
|
||||
|
||||
///
|
||||
@@ -88,13 +111,11 @@ pub struct ImageLayerInner {
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
}
|
||||
.to_string(),
|
||||
)
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
@@ -114,7 +135,8 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
// End-bound is exclusive
|
||||
self.lsn + 1
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
@@ -194,8 +216,8 @@ impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for tli {} seg {} at {} ----",
|
||||
self.timelineid, self.seg, self.lsn
|
||||
"----- image layer for ten {} tli {} seg {} at {} ----",
|
||||
self.tenantid, self.timelineid, self.seg, self.lsn
|
||||
);
|
||||
|
||||
let inner = self.load()?;
|
||||
@@ -214,18 +236,6 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
timelineid: ZTimelineId,
|
||||
@@ -241,13 +251,14 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
/// Create a new image file, using the given array of pages.
|
||||
fn create(
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
lsn: Lsn,
|
||||
base_images: Vec<Bytes>,
|
||||
nosync: bool,
|
||||
) -> Result<ImageLayer> {
|
||||
let image_type = if seg.rel.is_blocky() {
|
||||
let num_blocks: u32 = base_images.len().try_into()?;
|
||||
@@ -272,11 +283,11 @@ impl ImageLayer {
|
||||
|
||||
// Write the images into a file
|
||||
let path = layer.path();
|
||||
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let file = File::create(&path)?;
|
||||
let book = BookWriter::new(file, IMAGE_FILE_MAGIC)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
let book = match &image_type {
|
||||
ImageType::Blocky { .. } => {
|
||||
@@ -294,15 +305,30 @@ impl ImageLayer {
|
||||
}
|
||||
};
|
||||
|
||||
book.close()?;
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid,
|
||||
timelineid,
|
||||
seg,
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
lsn,
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
if !nosync {
|
||||
writer.get_ref().sync_all()?;
|
||||
}
|
||||
trace!("saved {}", path.display());
|
||||
|
||||
drop(inner);
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
/*
|
||||
// Create a new image file by materializing every page in a source layer
|
||||
// at given LSN.
|
||||
pub fn create_from_src(
|
||||
@@ -340,6 +366,7 @@ impl ImageLayer {
|
||||
|
||||
Self::create(conf, timelineid, timeline.tenantid, seg, lsn, base_images)
|
||||
}
|
||||
*/
|
||||
|
||||
///
|
||||
/// Load the contents of the file into memory
|
||||
@@ -354,6 +381,31 @@ impl ImageLayer {
|
||||
|
||||
let (path, book) = self.open_book()?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let actual_summary = Summary::des(&chapter)?;
|
||||
|
||||
let expected_summary = Summary::from(self);
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = Path::new(path.file_name().unwrap());
|
||||
let expected_filename = self.filename();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
"warning: filename does not match what is expected from in-file summary"
|
||||
);
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let image_type = if self.seg.rel.is_blocky() {
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
let images_len = chapter.len();
|
||||
@@ -376,15 +428,7 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
},
|
||||
);
|
||||
let path = self.path();
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
@@ -415,22 +459,37 @@ impl ImageLayer {
|
||||
/// Create an ImageLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path(
|
||||
path: &Path,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
filename: &ImageFileName,
|
||||
) -> ImageLayer {
|
||||
ImageLayer {
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<ImageLayer> {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
Ok(ImageLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
lsn: filename.lsn,
|
||||
timelineid: summary.timelineid,
|
||||
tenantid: summary.tenantid,
|
||||
seg: summary.seg,
|
||||
lsn: summary.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
}
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,17 +12,17 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::page_versions::PageVersions;
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
@@ -31,8 +31,7 @@ pub struct InMemoryLayer {
|
||||
|
||||
///
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
/// start is inclusive. There is no end LSN; we only use in-memory
|
||||
/// layer at the end of a timeline.
|
||||
/// start is inclusive.
|
||||
///
|
||||
start_lsn: Lsn,
|
||||
|
||||
@@ -41,37 +40,51 @@ pub struct InMemoryLayer {
|
||||
|
||||
/// The above fields never change. The parts that do change are in 'inner',
|
||||
/// and protected by mutex.
|
||||
inner: Mutex<InMemoryLayerInner>,
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
/// Predecessor layer
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
/// Predecessor layer might be needed?
|
||||
incremental: bool,
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// Frozen in-memory layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// If this relation was dropped, remember when that happened.
|
||||
drop_lsn: Option<Lsn>,
|
||||
/// The drop LSN is recorded in [`end_lsn`].
|
||||
dropped: bool,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
///
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
page_versions: PageVersions,
|
||||
|
||||
///
|
||||
/// `segsizes` tracks the size of the segment at different points in time.
|
||||
///
|
||||
segsizes: BTreeMap<Lsn, u32>,
|
||||
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
|
||||
/// so that determining the size never depends on the predecessor layer. For
|
||||
/// a non-blocky rel, 'segsizes' is not used and is always empty.
|
||||
///
|
||||
segsizes: VecMap<Lsn, u32>,
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
fn assert_writeable(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
}
|
||||
|
||||
fn get_seg_size(&self, lsn: Lsn) -> u32 {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
let slice = self.segsizes.slice_range(..=lsn);
|
||||
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
// We make sure there is always at least one entry
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
*entry
|
||||
} else {
|
||||
0
|
||||
panic!("could not find seg size in in-memory layer");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -80,23 +93,20 @@ impl Layer for InMemoryLayer {
|
||||
// An in-memory layer doesn't really have a filename as it's not stored on disk,
|
||||
// but we construct a filename as if it was a delta layer
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn;
|
||||
let dropped;
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
if let Some(drop_lsn) = inner.end_lsn {
|
||||
end_lsn = drop_lsn;
|
||||
dropped = true;
|
||||
} else {
|
||||
end_lsn = Lsn(u64::MAX);
|
||||
dropped = false;
|
||||
}
|
||||
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
dropped: inner.dropped,
|
||||
}
|
||||
.to_string();
|
||||
|
||||
@@ -116,18 +126,18 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
drop_lsn
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
end_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
inner.drop_lsn.is_some()
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.dropped
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
@@ -137,53 +147,46 @@ impl Layer for InMemoryLayer {
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
let mut cont_lsn: Option<Lsn> = Some(lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
{
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Scan the BTreeMap backwards, starting from reconstruct_data.lsn.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let mut iter = inner
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let iter = inner
|
||||
.page_versions
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
|
||||
.get_block_lsn_range(blknum, ..=lsn)
|
||||
.iter()
|
||||
.rev();
|
||||
for (entry_lsn, entry) in iter {
|
||||
if let Some(img) = &entry.page_image {
|
||||
reconstruct_data.page_img = Some(img.clone());
|
||||
cont_lsn = None;
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec) = &entry.record {
|
||||
reconstruct_data.records.push(rec.clone());
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
cont_lsn = None;
|
||||
need_image = false;
|
||||
break;
|
||||
} else {
|
||||
// This WAL record needs to be applied against an older page image
|
||||
cont_lsn = Some(*entry_lsn);
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know about the predecessor layer.
|
||||
if let Some(cont_lsn) = cont_lsn {
|
||||
if let Some(cont_layer) = &self.predecessor {
|
||||
Ok(PageReconstructResult::Continue(
|
||||
cont_lsn,
|
||||
Arc::clone(cont_layer),
|
||||
))
|
||||
// caller know
|
||||
if need_image {
|
||||
if self.incremental {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(cont_lsn))
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
@@ -193,18 +196,27 @@ impl Layer for InMemoryLayer {
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
Ok(inner.get_seg_size(lsn))
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// If the segment created after requested LSN,
|
||||
// it doesn't exist in the layer. But we shouldn't
|
||||
// have requested it in the first place.
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
if lsn >= drop_lsn {
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
@@ -227,35 +239,35 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.predecessor.is_some()
|
||||
self.incremental
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
.drop_lsn
|
||||
.end_lsn
|
||||
.as_ref()
|
||||
.map(|drop_lsn| drop_lsn.to_string())
|
||||
.map(Lsn::to_string)
|
||||
.unwrap_or_default();
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} seg {} {}-{} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str
|
||||
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
|
||||
);
|
||||
|
||||
for (k, v) in inner.segsizes.iter() {
|
||||
for (k, v) in inner.segsizes.as_slice() {
|
||||
println!("segsizes {}: {}", k, v);
|
||||
}
|
||||
|
||||
for (k, v) in inner.page_versions.iter() {
|
||||
for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
println!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
k.0,
|
||||
k.1,
|
||||
v.page_image.is_some(),
|
||||
v.record.is_some()
|
||||
blknum,
|
||||
lsn,
|
||||
pv.page_image.is_some(),
|
||||
pv.record.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -263,9 +275,17 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
// Type alias to simplify InMemoryLayer::freeze signature
|
||||
//
|
||||
type SuccessorLayers = (Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>);
|
||||
/// A result of an inmemory layer data being written to disk.
|
||||
pub struct LayersOnDisk {
|
||||
pub delta_layers: Vec<DeltaLayer>,
|
||||
pub image_layers: Vec<ImageLayer>,
|
||||
}
|
||||
|
||||
impl LayersOnDisk {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.delta_layers.is_empty() && self.image_layers.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
@@ -291,6 +311,12 @@ impl InMemoryLayer {
|
||||
start_lsn
|
||||
);
|
||||
|
||||
// The segment is initially empty, so initialize 'segsizes' with 0.
|
||||
let mut segsizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
segsizes.append(start_lsn, 0).unwrap();
|
||||
}
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
@@ -298,22 +324,23 @@ impl InMemoryLayer {
|
||||
seg,
|
||||
start_lsn,
|
||||
oldest_pending_lsn,
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
segsizes: BTreeMap::new(),
|
||||
incremental: false,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
}),
|
||||
predecessor: None,
|
||||
})
|
||||
}
|
||||
|
||||
// Write operations
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> Result<()> {
|
||||
pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> u32 {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
rec.lsn,
|
||||
lsn,
|
||||
PageVersion {
|
||||
page_image: None,
|
||||
record: Some(rec),
|
||||
@@ -322,7 +349,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> {
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
@@ -335,7 +362,7 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 {
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
trace!(
|
||||
@@ -345,9 +372,11 @@ impl InMemoryLayer {
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
let old = inner.page_versions.insert((blknum, lsn), pv);
|
||||
inner.assert_writeable();
|
||||
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -361,7 +390,7 @@ impl InMemoryLayer {
|
||||
if self.seg.rel.is_blocky() {
|
||||
let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1;
|
||||
|
||||
// use inner get_seg_size, since calling self.get_seg_size will try to acquire self.inner.lock
|
||||
// use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
|
||||
// which we've just acquired above
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
if newsize > oldsize {
|
||||
@@ -387,11 +416,14 @@ impl InMemoryLayer {
|
||||
page_image: Some(ZERO_PAGE.clone()),
|
||||
record: None,
|
||||
};
|
||||
println!(
|
||||
trace!(
|
||||
"filling gap blk {} with zeros for write of {}",
|
||||
gapblknum, blknum
|
||||
gapblknum,
|
||||
blknum
|
||||
);
|
||||
let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
|
||||
let old = inner
|
||||
.page_versions
|
||||
.append_or_update_last(gapblknum, lsn, zeropv);
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
|
||||
if old.is_some() {
|
||||
@@ -402,36 +434,47 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
inner.segsizes.insert(lsn, newsize);
|
||||
inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
|
||||
return newsize - oldsize;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
0
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> anyhow::Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let old = inner.segsizes.insert(lsn, segsize);
|
||||
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) {
|
||||
assert!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"put_truncation() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.assert_writeable();
|
||||
|
||||
// check that this we truncate to a smaller size than segment was before the truncation
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
assert!(segsize < oldsize);
|
||||
|
||||
let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Inserting truncation, but had an entry for the LSN already");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that the segment was dropped at given LSN
|
||||
pub fn drop_segment(&self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
pub fn drop_segment(&self, lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
assert!(inner.drop_lsn.is_none());
|
||||
inner.drop_lsn = Some(lsn);
|
||||
assert!(inner.end_lsn.is_none());
|
||||
assert!(!inner.dropped);
|
||||
inner.dropped = true;
|
||||
assert!(self.start_lsn < lsn);
|
||||
inner.end_lsn = Some(lsn);
|
||||
|
||||
info!("dropped segment {} at {}", self.seg, lsn);
|
||||
|
||||
Ok(())
|
||||
trace!("dropped segment {} at {}", self.seg, lsn);
|
||||
}
|
||||
|
||||
///
|
||||
@@ -448,6 +491,9 @@ impl InMemoryLayer {
|
||||
) -> Result<InMemoryLayer> {
|
||||
let seg = src.get_seg_tag();
|
||||
|
||||
assert!(oldest_pending_lsn.is_aligned());
|
||||
assert!(oldest_pending_lsn >= start_lsn);
|
||||
|
||||
trace!(
|
||||
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
|
||||
seg,
|
||||
@@ -455,11 +501,11 @@ impl InMemoryLayer {
|
||||
start_lsn,
|
||||
);
|
||||
|
||||
// For convenience, copy the segment size from the predecessor layer
|
||||
let mut segsizes = BTreeMap::new();
|
||||
// Copy the segment size at the start LSN from the predecessor layer.
|
||||
let mut segsizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
let size = src.get_seg_size(start_lsn)?;
|
||||
segsizes.insert(start_lsn, size);
|
||||
segsizes.append(start_lsn, size).unwrap();
|
||||
}
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
@@ -469,100 +515,108 @@ impl InMemoryLayer {
|
||||
seg,
|
||||
start_lsn,
|
||||
oldest_pending_lsn,
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
incremental: true,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
}),
|
||||
predecessor: Some(src),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn is_writeable(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.end_lsn.is_none()
|
||||
}
|
||||
|
||||
/// Make the layer non-writeable. Only call once.
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is inclusive
|
||||
pub fn freeze(&self, end_lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
if inner.end_lsn.is_some() {
|
||||
assert!(inner.dropped);
|
||||
} else {
|
||||
assert!(!inner.dropped);
|
||||
assert!(self.start_lsn < end_lsn + 1);
|
||||
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
|
||||
|
||||
if let Some((lsn, _)) = inner.segsizes.as_slice().last() {
|
||||
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
|
||||
}
|
||||
|
||||
for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
assert!(lsn <= end_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the this frozen in-memory layer to disk.
|
||||
///
|
||||
/// Write the this in-memory layer to disk.
|
||||
///
|
||||
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
|
||||
///
|
||||
/// Returns new layers that replace this one. Always returns a new image
|
||||
/// layer containing the page versions at the cutoff LSN, that were written
|
||||
/// to disk, and usually also a DeltaLayer that includes all the WAL records
|
||||
/// between start LSN and the cutoff. (The delta layer is not needed when
|
||||
/// a new relish is created with a single LSN, so that the start and end LSN
|
||||
/// are the same.) If there were page versions newer than 'end_lsn', also
|
||||
/// returns a new in-memory layer containing those page versions. The caller
|
||||
/// replaces this layer with the returned layers in the layer map.
|
||||
///
|
||||
pub fn freeze(
|
||||
&self,
|
||||
cutoff_lsn: Lsn,
|
||||
// This is needed just to call materialize_page()
|
||||
timeline: &LayeredTimeline,
|
||||
) -> Result<SuccessorLayers> {
|
||||
info!(
|
||||
"freezing in memory layer for {} on timeline {} at {}",
|
||||
self.seg, self.timelineid, cutoff_lsn
|
||||
/// Returns new layers that replace this one.
|
||||
/// If not dropped, returns a new image layer containing the page versions
|
||||
/// at the `end_lsn`. Can also return a DeltaLayer that includes all the
|
||||
/// WAL records between start and end LSN. (The delta layer is not needed
|
||||
/// when a new relish is created with a single LSN, so that the start and
|
||||
/// end LSN are the same.)
|
||||
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
|
||||
trace!(
|
||||
"write_to_disk {} get_end_lsn is {}",
|
||||
self.filename().display(),
|
||||
self.get_end_lsn()
|
||||
);
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
// though: another thread might have grabbed a reference to this layer
|
||||
// in `get_layer_for_write' just before the checkpointer called
|
||||
// `freeze`, and then `write_to_disk` on it. When the thread gets the
|
||||
// lock, it will see that it's not writeable anymore and retry, but it
|
||||
// would have to wait until we release it. That race condition is very
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
let end_lsn_exclusive = inner.end_lsn.unwrap();
|
||||
|
||||
// Normally, use the cutoff LSN as the end of the frozen layer.
|
||||
// But if the relation was dropped, we know that there are no
|
||||
// more changes coming in for it, and in particular we know that
|
||||
// there are no changes "in flight" for the LSN anymore, so we use
|
||||
// the drop LSN instead. The drop-LSN could be ahead of the
|
||||
// caller-specified LSN!
|
||||
let dropped = inner.drop_lsn.is_some();
|
||||
let end_lsn = if dropped {
|
||||
inner.drop_lsn.unwrap()
|
||||
} else {
|
||||
cutoff_lsn
|
||||
};
|
||||
|
||||
// Divide all the page versions into old and new at the 'end_lsn' cutoff point.
|
||||
let mut before_page_versions;
|
||||
let mut before_segsizes;
|
||||
let mut after_page_versions;
|
||||
let mut after_segsizes;
|
||||
if !dropped {
|
||||
before_segsizes = BTreeMap::new();
|
||||
after_segsizes = BTreeMap::new();
|
||||
for (lsn, size) in inner.segsizes.iter() {
|
||||
if *lsn > end_lsn {
|
||||
after_segsizes.insert(*lsn, *size);
|
||||
} else {
|
||||
before_segsizes.insert(*lsn, *size);
|
||||
}
|
||||
}
|
||||
|
||||
before_page_versions = BTreeMap::new();
|
||||
after_page_versions = BTreeMap::new();
|
||||
for ((blknum, lsn), pv) in inner.page_versions.iter() {
|
||||
match lsn.cmp(&end_lsn) {
|
||||
Ordering::Less => {
|
||||
before_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
Ordering::Equal => {
|
||||
// Page versions at the cutoff LSN will be stored in the
|
||||
// materialized image layer.
|
||||
}
|
||||
Ordering::Greater => {
|
||||
after_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
before_page_versions = inner.page_versions.clone();
|
||||
before_segsizes = inner.segsizes.clone();
|
||||
after_segsizes = BTreeMap::new();
|
||||
after_page_versions = BTreeMap::new();
|
||||
if inner.dropped {
|
||||
let delta_layer = DeltaLayer::create(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_exclusive,
|
||||
true,
|
||||
inner.page_versions.ordered_page_version_iter(None),
|
||||
inner.segsizes.clone(),
|
||||
)?;
|
||||
trace!(
|
||||
"freeze: created delta layer for dropped segment {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_exclusive
|
||||
);
|
||||
return Ok(LayersOnDisk {
|
||||
delta_layers: vec![delta_layer],
|
||||
image_layers: Vec::new(),
|
||||
});
|
||||
}
|
||||
|
||||
// we can release the lock now.
|
||||
drop(inner);
|
||||
// Since `end_lsn` is inclusive, subtract 1.
|
||||
// We want to make an ImageLayer for the last included LSN,
|
||||
// so the DeltaLayer should exlcude that LSN.
|
||||
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
|
||||
|
||||
let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();
|
||||
let mut page_versions = inner
|
||||
.page_versions
|
||||
.ordered_page_version_iter(Some(end_lsn_inclusive));
|
||||
|
||||
if self.start_lsn != end_lsn {
|
||||
let mut delta_layers = Vec::new();
|
||||
|
||||
if self.start_lsn != end_lsn_inclusive {
|
||||
let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive);
|
||||
// Write the page versions before the cutoff to disk.
|
||||
let delta_layer = DeltaLayer::create(
|
||||
self.conf,
|
||||
@@ -570,53 +624,36 @@ impl InMemoryLayer {
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
self.predecessor.clone(),
|
||||
before_page_versions,
|
||||
before_segsizes,
|
||||
end_lsn_inclusive,
|
||||
false,
|
||||
page_versions,
|
||||
segsizes,
|
||||
)?;
|
||||
let delta_layer_rc: Arc<dyn Layer> = Arc::new(delta_layer);
|
||||
frozen_layers.push(delta_layer_rc);
|
||||
delta_layers.push(delta_layer);
|
||||
trace!(
|
||||
"freeze: created delta layer {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn
|
||||
end_lsn_inclusive
|
||||
);
|
||||
} else {
|
||||
assert!(before_page_versions.is_empty());
|
||||
assert!(page_versions.next().is_none());
|
||||
}
|
||||
|
||||
let mut new_open_rc = None;
|
||||
if !dropped {
|
||||
// Write a new base image layer at the cutoff point
|
||||
let imgfile = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
|
||||
let imgfile_rc: Arc<dyn Layer> = Arc::new(imgfile);
|
||||
frozen_layers.push(Arc::clone(&imgfile_rc));
|
||||
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
|
||||
drop(inner);
|
||||
|
||||
// If there were any page versions newer than the cutoff, initialize a new in-memory
|
||||
// layer to hold them
|
||||
if !after_segsizes.is_empty() || !after_page_versions.is_empty() {
|
||||
let new_open = Self::create_successor_layer(
|
||||
self.conf,
|
||||
imgfile_rc,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
end_lsn,
|
||||
end_lsn,
|
||||
)?;
|
||||
let mut new_inner = new_open.inner.lock().unwrap();
|
||||
new_inner.page_versions.append(&mut after_page_versions);
|
||||
new_inner.segsizes.append(&mut after_segsizes);
|
||||
drop(new_inner);
|
||||
trace!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
|
||||
// Write a new base image layer at the cutoff point
|
||||
let image_layer =
|
||||
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
|
||||
trace!(
|
||||
"freeze: created image layer {} at {}",
|
||||
self.seg,
|
||||
end_lsn_inclusive
|
||||
);
|
||||
|
||||
new_open_rc = Some(Arc::new(new_open))
|
||||
}
|
||||
}
|
||||
|
||||
Ok((frozen_layers, new_open_rc))
|
||||
Ok(LayersOnDisk {
|
||||
delta_layers,
|
||||
image_layers: vec![image_layer],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
468
pageserver/src/layered_repository/interval_tree.rs
Normal file
468
pageserver/src/layered_repository/interval_tree.rs
Normal file
@@ -0,0 +1,468 @@
|
||||
///
|
||||
/// IntervalTree is data structure for holding intervals. It is generic
|
||||
/// to make unit testing possible, but the only real user of it is the layer map,
|
||||
///
|
||||
/// It's inspired by the "segment tree" or a "statistic tree" as described in
|
||||
/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold
|
||||
/// the points instead of a binary tree. This is called an "interval tree" instead
|
||||
/// of "segment tree" because the term "segment" is already using Zenith to mean
|
||||
/// something else. To add to the confusion, there is another data structure known
|
||||
/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree),
|
||||
/// for storing intervals, but this isn't that.
|
||||
///
|
||||
/// The basic idea is to have a B-tree of "interesting Points". At each Point,
|
||||
/// there is a list of intervals that contain the point. The Points are formed
|
||||
/// from the start bounds of each interval; there is a Point for each distinct
|
||||
/// start bound.
|
||||
///
|
||||
/// Operations:
|
||||
///
|
||||
/// To find intervals that contain a given point, you search the b-tree to find
|
||||
/// the nearest Point <= search key. Then you just return the list of intervals.
|
||||
///
|
||||
/// To insert an interval, find the Point with start key equal to the inserted item.
|
||||
/// If the Point doesn't exist yet, create it, by copying all the items from the
|
||||
/// previous Point that cover the new Point. Then walk right, inserting the new
|
||||
/// interval to all the Points that are contained by the new interval (including the
|
||||
/// newly created Point).
|
||||
///
|
||||
/// To remove an interval, you scan the tree for all the Points that are contained by
|
||||
/// the removed interval, and remove it from the list in each Point.
|
||||
///
|
||||
/// Requirements and assumptions:
|
||||
///
|
||||
/// - Can store overlapping items
|
||||
/// - But there are not many overlapping items
|
||||
/// - The interval bounds don't change after it is added to the tree
|
||||
/// - Intervals are uniquely identified by pointer equality. You must not be insert the
|
||||
/// same interval object twice, and `remove` uses pointer equality to remove the right
|
||||
/// interval. It is OK to have two intervals with the same bounds, however.
|
||||
///
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct IntervalTree<I: ?Sized>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
points: BTreeMap<I::Key, Point<I>>,
|
||||
}
|
||||
|
||||
struct Point<I: ?Sized> {
|
||||
/// All intervals that contain this point, in no particular order.
|
||||
///
|
||||
/// We assume that there aren't a lot of overlappingg intervals, so that this vector
|
||||
/// never grows very large. If that assumption doesn't hold, we could keep this ordered
|
||||
/// by the end bound, to speed up `search`. But as long as there are only a few elements,
|
||||
/// a linear search is OK.
|
||||
elements: Vec<Arc<I>>,
|
||||
}
|
||||
|
||||
/// Abstraction for an interval that can be stored in the tree
|
||||
///
|
||||
/// The start bound is inclusive and the end bound is exclusive. End must be greater
|
||||
/// than start.
|
||||
pub trait IntervalItem {
|
||||
type Key: Ord + Copy + Debug + Sized;
|
||||
|
||||
fn start_key(&self) -> Self::Key;
|
||||
fn end_key(&self) -> Self::Key;
|
||||
|
||||
fn bounds(&self) -> Range<Self::Key> {
|
||||
self.start_key()..self.end_key()
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: ?Sized> IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
/// Return an element that contains 'key', or precedes it.
|
||||
///
|
||||
/// If there are multiple candidates, returns the one with the highest 'end' key.
|
||||
pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
|
||||
// Find the greatest point that precedes or is equal to the search key. If there is
|
||||
// none, returns None.
|
||||
let (_, p) = self.points.range(..=key).next_back()?;
|
||||
|
||||
// Find the element with the highest end key at this point
|
||||
let highest_item = p
|
||||
.elements
|
||||
.iter()
|
||||
.reduce(|a, b| {
|
||||
// starting with Rust 1.53, could use `std::cmp::min_by_key` here
|
||||
if a.end_key() > b.end_key() {
|
||||
a
|
||||
} else {
|
||||
b
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
Some(Arc::clone(highest_item))
|
||||
}
|
||||
|
||||
/// Iterate over all items with start bound >= 'key'
|
||||
pub fn iter_newer(&self, key: I::Key) -> IntervalIter<I> {
|
||||
IntervalIter {
|
||||
point_iter: self.points.range(key..),
|
||||
elem_iter: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterate over all items
|
||||
pub fn iter(&self) -> IntervalIter<I> {
|
||||
IntervalIter {
|
||||
point_iter: self.points.range(..),
|
||||
elem_iter: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, item: Arc<I>) {
|
||||
let start_key = item.start_key();
|
||||
let end_key = item.end_key();
|
||||
assert!(start_key < end_key);
|
||||
let bounds = start_key..end_key;
|
||||
|
||||
// Find the starting point and walk forward from there
|
||||
let mut found_start_point = false;
|
||||
let iter = self.points.range_mut(bounds);
|
||||
for (point_key, point) in iter {
|
||||
if *point_key == start_key {
|
||||
found_start_point = true;
|
||||
// It is an error to insert the same item to the tree twice.
|
||||
assert!(
|
||||
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
|
||||
"interval is already in the tree"
|
||||
);
|
||||
}
|
||||
point.elements.push(Arc::clone(&item));
|
||||
}
|
||||
if !found_start_point {
|
||||
// Create a new Point for the starting point
|
||||
|
||||
// Look at the previous point, and copy over elements that overlap with this
|
||||
// new point
|
||||
let mut new_elements: Vec<Arc<I>> = Vec::new();
|
||||
if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
|
||||
let overlapping_prev_elements = prev_point
|
||||
.elements
|
||||
.iter()
|
||||
.filter(|x| x.bounds().contains(&start_key))
|
||||
.cloned();
|
||||
|
||||
new_elements.extend(overlapping_prev_elements);
|
||||
}
|
||||
new_elements.push(item);
|
||||
|
||||
let new_point = Point {
|
||||
elements: new_elements,
|
||||
};
|
||||
self.points.insert(start_key, new_point);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, item: &Arc<I>) {
|
||||
// range search points
|
||||
let start_key = item.start_key();
|
||||
let end_key = item.end_key();
|
||||
let bounds = start_key..end_key;
|
||||
|
||||
let mut points_to_remove: Vec<I::Key> = Vec::new();
|
||||
let mut found_start_point = false;
|
||||
for (point_key, point) in self.points.range_mut(bounds) {
|
||||
if *point_key == start_key {
|
||||
found_start_point = true;
|
||||
}
|
||||
let len_before = point.elements.len();
|
||||
point.elements.retain(|other| !Arc::ptr_eq(other, item));
|
||||
let len_after = point.elements.len();
|
||||
assert_eq!(len_after + 1, len_before);
|
||||
if len_after == 0 {
|
||||
points_to_remove.push(*point_key);
|
||||
}
|
||||
}
|
||||
assert!(found_start_point);
|
||||
|
||||
for k in points_to_remove {
|
||||
self.points.remove(&k).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IntervalIter<'a, I: ?Sized>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
|
||||
elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for IntervalIter<'a, I>
|
||||
where
|
||||
I: IntervalItem + ?Sized,
|
||||
{
|
||||
type Item = Arc<I>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// Iterate over all elements in all the points in 'point_iter'. To avoid
|
||||
// returning the same element twice, we only return each element at its
|
||||
// starting point.
|
||||
loop {
|
||||
// Return next remaining element from the current point
|
||||
if let Some((point_key, elem_iter)) = &mut self.elem_iter {
|
||||
for elem in elem_iter {
|
||||
if elem.start_key() == *point_key {
|
||||
return Some(Arc::clone(elem));
|
||||
}
|
||||
}
|
||||
}
|
||||
// No more elements at this point. Move to next point.
|
||||
if let Some((point_key, point)) = self.point_iter.next() {
|
||||
self.elem_iter = Some((*point_key, point.elements.iter()));
|
||||
continue;
|
||||
} else {
|
||||
// No more points, all done
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: ?Sized> Default for IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
fn default() -> Self {
|
||||
IntervalTree {
|
||||
points: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct MockItem {
|
||||
start_key: u32,
|
||||
end_key: u32,
|
||||
val: String,
|
||||
}
|
||||
impl IntervalItem for MockItem {
|
||||
type Key = u32;
|
||||
|
||||
fn start_key(&self) -> u32 {
|
||||
self.start_key
|
||||
}
|
||||
fn end_key(&self) -> u32 {
|
||||
self.end_key
|
||||
}
|
||||
}
|
||||
impl MockItem {
|
||||
fn new(start_key: u32, end_key: u32) -> Self {
|
||||
MockItem {
|
||||
start_key,
|
||||
end_key,
|
||||
val: format!("{}-{}", start_key, end_key),
|
||||
}
|
||||
}
|
||||
fn new_str(start_key: u32, end_key: u32, val: &str) -> Self {
|
||||
MockItem {
|
||||
start_key,
|
||||
end_key,
|
||||
val: format!("{}-{}: {}", start_key, end_key, val),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl fmt::Display for MockItem {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.val)
|
||||
}
|
||||
}
|
||||
#[rustfmt::skip]
|
||||
fn assert_search(
|
||||
tree: &IntervalTree<MockItem>,
|
||||
key: u32,
|
||||
expected: &[&str],
|
||||
) -> Option<Arc<MockItem>> {
|
||||
if let Some(v) = tree.search(key) {
|
||||
let vstr = v.to_string();
|
||||
|
||||
assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
|
||||
assert!(
|
||||
expected.contains(&vstr.as_str()),
|
||||
"search with {} returned {}, expected one of: {:?}",
|
||||
key, v, expected,
|
||||
);
|
||||
|
||||
Some(v)
|
||||
} else {
|
||||
assert!(
|
||||
expected.is_empty(),
|
||||
"search with {} returned None, expected one of {:?}",
|
||||
key, expected
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_contents(tree: &IntervalTree<MockItem>, expected: &[&str]) {
|
||||
let mut contents: Vec<String> = tree.iter().map(|e| e.to_string()).collect();
|
||||
contents.sort();
|
||||
assert_eq!(contents, expected);
|
||||
}
|
||||
|
||||
fn dump_tree(tree: &IntervalTree<MockItem>) {
|
||||
for (point_key, point) in tree.points.iter() {
|
||||
print!("{}:", point_key);
|
||||
for e in point.elements.iter() {
|
||||
print!(" {}", e);
|
||||
}
|
||||
println!();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_simple() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Simple, non-overlapping ranges.
|
||||
tree.insert(Arc::new(MockItem::new(10, 11)));
|
||||
tree.insert(Arc::new(MockItem::new(11, 12)));
|
||||
tree.insert(Arc::new(MockItem::new(12, 13)));
|
||||
tree.insert(Arc::new(MockItem::new(18, 19)));
|
||||
tree.insert(Arc::new(MockItem::new(17, 18)));
|
||||
tree.insert(Arc::new(MockItem::new(15, 16)));
|
||||
|
||||
assert_search(&tree, 9, &[]);
|
||||
assert_search(&tree, 10, &["10-11"]);
|
||||
assert_search(&tree, 11, &["11-12"]);
|
||||
assert_search(&tree, 12, &["12-13"]);
|
||||
assert_search(&tree, 13, &["12-13"]);
|
||||
assert_search(&tree, 14, &["12-13"]);
|
||||
assert_search(&tree, 15, &["15-16"]);
|
||||
assert_search(&tree, 16, &["15-16"]);
|
||||
assert_search(&tree, 17, &["17-18"]);
|
||||
assert_search(&tree, 18, &["18-19"]);
|
||||
assert_search(&tree, 19, &["18-19"]);
|
||||
assert_search(&tree, 20, &["18-19"]);
|
||||
|
||||
// remove a few entries and search around them again
|
||||
tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry
|
||||
tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle
|
||||
tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry
|
||||
assert_search(&tree, 9, &[]);
|
||||
assert_search(&tree, 10, &[]);
|
||||
assert_search(&tree, 11, &["11-12"]);
|
||||
assert_search(&tree, 12, &["11-12"]);
|
||||
assert_search(&tree, 14, &["11-12"]);
|
||||
assert_search(&tree, 15, &["15-16"]);
|
||||
assert_search(&tree, 17, &["17-18"]);
|
||||
assert_search(&tree, 18, &["17-18"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_overlap() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Overlapping items
|
||||
tree.insert(Arc::new(MockItem::new(22, 24)));
|
||||
tree.insert(Arc::new(MockItem::new(23, 25)));
|
||||
let x24_26 = Arc::new(MockItem::new(24, 26));
|
||||
tree.insert(Arc::clone(&x24_26));
|
||||
let x26_28 = Arc::new(MockItem::new(26, 28));
|
||||
tree.insert(Arc::clone(&x26_28));
|
||||
tree.insert(Arc::new(MockItem::new(25, 27)));
|
||||
|
||||
assert_search(&tree, 22, &["22-24"]);
|
||||
assert_search(&tree, 23, &["22-24", "23-25"]);
|
||||
assert_search(&tree, 24, &["23-25", "24-26"]);
|
||||
assert_search(&tree, 25, &["24-26", "25-27"]);
|
||||
assert_search(&tree, 26, &["25-27", "26-28"]);
|
||||
assert_search(&tree, 27, &["26-28"]);
|
||||
assert_search(&tree, 28, &["26-28"]);
|
||||
assert_search(&tree, 29, &["26-28"]);
|
||||
|
||||
tree.remove(&x24_26);
|
||||
tree.remove(&x26_28);
|
||||
assert_search(&tree, 23, &["22-24", "23-25"]);
|
||||
assert_search(&tree, 24, &["23-25"]);
|
||||
assert_search(&tree, 25, &["25-27"]);
|
||||
assert_search(&tree, 26, &["25-27"]);
|
||||
assert_search(&tree, 27, &["25-27"]);
|
||||
assert_search(&tree, 28, &["25-27"]);
|
||||
assert_search(&tree, 29, &["25-27"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_nested() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Items containing other items
|
||||
tree.insert(Arc::new(MockItem::new(31, 39)));
|
||||
tree.insert(Arc::new(MockItem::new(32, 34)));
|
||||
tree.insert(Arc::new(MockItem::new(33, 35)));
|
||||
tree.insert(Arc::new(MockItem::new(30, 40)));
|
||||
|
||||
assert_search(&tree, 30, &["30-40"]);
|
||||
assert_search(&tree, 31, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 32, &["30-40", "32-34", "31-39"]);
|
||||
assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]);
|
||||
assert_search(&tree, 34, &["30-40", "33-35", "31-39"]);
|
||||
assert_search(&tree, 35, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 36, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 37, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 38, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 39, &["30-40"]);
|
||||
assert_search(&tree, 40, &["30-40"]);
|
||||
assert_search(&tree, 41, &["30-40"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_duplicates() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Duplicate keys
|
||||
let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
|
||||
tree.insert(Arc::clone(&item_a));
|
||||
let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
|
||||
tree.insert(Arc::clone(&item_b));
|
||||
let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
|
||||
tree.insert(Arc::clone(&item_c));
|
||||
let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
|
||||
tree.insert(Arc::clone(&item_d));
|
||||
let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
|
||||
tree.insert(Arc::clone(&item_e));
|
||||
|
||||
dump_tree(&tree);
|
||||
|
||||
assert_search(
|
||||
&tree,
|
||||
55,
|
||||
&["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"],
|
||||
);
|
||||
tree.remove(&item_b);
|
||||
dump_tree(&tree);
|
||||
|
||||
assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]);
|
||||
|
||||
tree.remove(&item_d);
|
||||
dump_tree(&tree);
|
||||
assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_interval_tree_insert_twice() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Inserting the same item twice is not cool
|
||||
let item = Arc::new(MockItem::new(1, 2));
|
||||
tree.insert(Arc::clone(&item));
|
||||
tree.insert(Arc::clone(&item)); // fails assertion
|
||||
}
|
||||
}
|
||||
@@ -9,16 +9,14 @@
|
||||
//! new image and delta layers and corresponding files are written to disk.
|
||||
//!
|
||||
|
||||
use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
|
||||
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
|
||||
use crate::layered_repository::InMemoryLayer;
|
||||
use crate::relish::*;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{BTreeMap, BinaryHeap, HashMap};
|
||||
use std::ops::Bound::Included;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::sync::Arc;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -35,68 +33,21 @@ lazy_static! {
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct LayerMap {
|
||||
/// All the layers keyed by segment tag
|
||||
segs: HashMap<SegmentTag, SegEntry>,
|
||||
|
||||
/// All in-memory layers, ordered by 'oldest_pending_lsn' of each layer.
|
||||
/// This allows easy access to the in-memory layer that contains the
|
||||
/// oldest WAL record.
|
||||
open_segs: BinaryHeap<OpenSegEntry>,
|
||||
/// All in-memory layers, ordered by 'oldest_pending_lsn' and generation
|
||||
/// of each layer. This allows easy access to the in-memory layer that
|
||||
/// contains the oldest WAL record.
|
||||
open_layers: BinaryHeap<OpenLayerEntry>,
|
||||
|
||||
/// Generation number, used to distinguish newly inserted entries in the
|
||||
/// binary heap from older entries during checkpoint.
|
||||
current_generation: u64,
|
||||
}
|
||||
|
||||
///
|
||||
/// Per-segment entry in the LayerMap.segs hash map
|
||||
///
|
||||
/// The last layer that is open for writes is always an InMemoryLayer,
|
||||
/// and is kept in a separate field, because there can be only one for
|
||||
/// each segment. The older layers, stored on disk, are kept in a
|
||||
/// BTreeMap keyed by the layer's start LSN.
|
||||
struct SegEntry {
|
||||
pub open: Option<Arc<InMemoryLayer>>,
|
||||
pub historic: BTreeMap<Lsn, Arc<dyn Layer>>,
|
||||
}
|
||||
|
||||
/// Entry held LayerMap.open_segs, with boilerplate comparison
|
||||
/// routines to implement a min-heap ordered by 'oldest_pending_lsn'
|
||||
///
|
||||
/// Each entry also carries a generation number. It can be used to distinguish
|
||||
/// entries with the same 'oldest_pending_lsn'.
|
||||
struct OpenSegEntry {
|
||||
pub oldest_pending_lsn: Lsn,
|
||||
pub layer: Arc<InMemoryLayer>,
|
||||
pub generation: u64,
|
||||
}
|
||||
impl Ord for OpenSegEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||
// to get that.
|
||||
other.oldest_pending_lsn.cmp(&self.oldest_pending_lsn)
|
||||
}
|
||||
}
|
||||
impl PartialOrd for OpenSegEntry {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||
// to get that. Entries with identical oldest_pending_lsn are ordered by generation
|
||||
Some(
|
||||
other
|
||||
.oldest_pending_lsn
|
||||
.cmp(&self.oldest_pending_lsn)
|
||||
.then_with(|| other.generation.cmp(&self.generation)),
|
||||
)
|
||||
}
|
||||
}
|
||||
impl PartialEq for OpenSegEntry {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.oldest_pending_lsn.eq(&other.oldest_pending_lsn)
|
||||
}
|
||||
}
|
||||
impl Eq for OpenSegEntry {}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Look up a layer using the given segment tag and LSN. This differs from a
|
||||
@@ -107,23 +58,7 @@ impl LayerMap {
|
||||
pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
|
||||
if let Some(open) = &segentry.open {
|
||||
if open.get_start_lsn() <= lsn {
|
||||
let x: Arc<dyn Layer> = Arc::clone(open) as _;
|
||||
return Some(x);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((_k, v)) = segentry
|
||||
.historic
|
||||
.range((Included(Lsn(0)), Included(lsn)))
|
||||
.next_back()
|
||||
{
|
||||
let x: Arc<dyn Layer> = Arc::clone(v) as _;
|
||||
Some(x)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
segentry.get(lsn)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -132,6 +67,7 @@ impl LayerMap {
|
||||
///
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
|
||||
segentry.open.as_ref().map(Arc::clone)
|
||||
}
|
||||
|
||||
@@ -139,38 +75,45 @@ impl LayerMap {
|
||||
/// Insert an open in-memory layer
|
||||
///
|
||||
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
let tag = layer.get_seg_tag();
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
|
||||
if let Some(segentry) = self.segs.get_mut(&tag) {
|
||||
if let Some(_old) = &segentry.open {
|
||||
// FIXME: shouldn't exist, but check
|
||||
}
|
||||
segentry.open = Some(Arc::clone(&layer));
|
||||
} else {
|
||||
let segentry = SegEntry {
|
||||
open: Some(Arc::clone(&layer)),
|
||||
historic: BTreeMap::new(),
|
||||
};
|
||||
self.segs.insert(tag, segentry);
|
||||
}
|
||||
segentry.update_open(Arc::clone(&layer));
|
||||
|
||||
let opensegentry = OpenSegEntry {
|
||||
let oldest_pending_lsn = layer.get_oldest_pending_lsn();
|
||||
|
||||
// After a crash and restart, 'oldest_pending_lsn' of the oldest in-memory
|
||||
// layer becomes the WAL streaming starting point, so it better not point
|
||||
// in the middle of a WAL record.
|
||||
assert!(oldest_pending_lsn.is_aligned());
|
||||
|
||||
// Also add it to the binary heap
|
||||
let open_layer_entry = OpenLayerEntry {
|
||||
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
|
||||
layer,
|
||||
generation: self.current_generation,
|
||||
};
|
||||
self.open_segs.push(opensegentry);
|
||||
self.open_layers.push(open_layer_entry);
|
||||
|
||||
NUM_INMEMORY_LAYERS.inc();
|
||||
}
|
||||
|
||||
/// Remove the oldest in-memory layer
|
||||
pub fn pop_oldest_open(&mut self) {
|
||||
let opensegentry = self.open_segs.pop().unwrap();
|
||||
let segtag = opensegentry.layer.get_seg_tag();
|
||||
// Pop it from the binary heap
|
||||
let oldest_entry = self.open_layers.pop().unwrap();
|
||||
let segtag = oldest_entry.layer.get_seg_tag();
|
||||
|
||||
// Also remove it from the SegEntry of this segment
|
||||
let mut segentry = self.segs.get_mut(&segtag).unwrap();
|
||||
segentry.open = None;
|
||||
if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) {
|
||||
segentry.open = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!oldest_entry.layer.is_writeable());
|
||||
assert!(oldest_entry.layer.is_dropped());
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
}
|
||||
|
||||
@@ -178,21 +121,9 @@ impl LayerMap {
|
||||
/// Insert an on-disk layer
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let tag = layer.get_seg_tag();
|
||||
let start_lsn = layer.get_start_lsn();
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
segentry.insert_historic(layer);
|
||||
|
||||
if let Some(segentry) = self.segs.get_mut(&tag) {
|
||||
segentry.historic.insert(start_lsn, layer);
|
||||
} else {
|
||||
let mut historic = BTreeMap::new();
|
||||
historic.insert(start_lsn, layer);
|
||||
|
||||
let segentry = SegEntry {
|
||||
open: None,
|
||||
historic,
|
||||
};
|
||||
self.segs.insert(tag, segentry);
|
||||
}
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
}
|
||||
|
||||
@@ -201,62 +132,40 @@ impl LayerMap {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer: &dyn Layer) {
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let tag = layer.get_seg_tag();
|
||||
let start_lsn = layer.get_start_lsn();
|
||||
|
||||
if let Some(segentry) = self.segs.get_mut(&tag) {
|
||||
segentry.historic.remove(&start_lsn);
|
||||
segentry.historic.remove(&layer);
|
||||
}
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
// List relations that exist at the lsn
|
||||
pub fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
|
||||
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||
// List relations along with a flag that marks if they exist at the given lsn.
|
||||
// spcnode 0 and dbnode 0 have special meanings and mean all tabespaces/databases.
|
||||
// Pass Tag if we're only interested in some relations.
|
||||
pub fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashMap<RelishTag, bool>> {
|
||||
let mut rels: HashMap<RelishTag, bool> = HashMap::new();
|
||||
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let RelishTag::Relation(reltag) = seg.rel {
|
||||
if (spcnode == 0 || reltag.spcnode == spcnode)
|
||||
&& (dbnode == 0 || reltag.dbnode == dbnode)
|
||||
{
|
||||
// Add only if it exists at the requested LSN.
|
||||
if let Some(open) = &segentry.open {
|
||||
if open.get_end_lsn() > lsn {
|
||||
rels.insert(reltag);
|
||||
match seg.rel {
|
||||
RelishTag::Relation(reltag) => {
|
||||
if let Some(request_rel) = tag {
|
||||
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
|
||||
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
|
||||
{
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
} else if let Some((_k, _v)) = segentry
|
||||
.historic
|
||||
.range((Included(Lsn(0)), Included(lsn)))
|
||||
.next_back()
|
||||
{
|
||||
rels.insert(reltag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
// List non-relation relishes that exist at the lsn
|
||||
pub fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>> {
|
||||
let mut rels: HashSet<RelishTag> = HashSet::new();
|
||||
|
||||
// Scan the timeline directory to get all rels in this timeline.
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let RelishTag::Relation(_) = seg.rel {
|
||||
} else {
|
||||
// Add only if it exists at the requested LSN.
|
||||
if let Some(open) = &segentry.open {
|
||||
if open.get_end_lsn() > lsn {
|
||||
rels.insert(seg.rel);
|
||||
_ => {
|
||||
if tag == None {
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
} else if let Some((_k, _v)) = segentry
|
||||
.historic
|
||||
.range((Included(Lsn(0)), Included(lsn)))
|
||||
.next_back()
|
||||
{
|
||||
rels.insert(seg.rel);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -269,42 +178,31 @@ impl LayerMap {
|
||||
/// be deleted.
|
||||
pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
|
||||
if let Some(segentry) = self.segs.get(&seg) {
|
||||
// We only check on-disk layers, because
|
||||
// in-memory layers are not durable
|
||||
for (newer_lsn, layer) in segentry
|
||||
.historic
|
||||
.range((Included(lsn), Included(Lsn(u64::MAX))))
|
||||
{
|
||||
// Ignore layers that depend on an older layer.
|
||||
if layer.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if layer.get_end_lsn() > lsn {
|
||||
trace!(
|
||||
"found later layer for {}, {} {}-{}",
|
||||
seg,
|
||||
lsn,
|
||||
newer_lsn,
|
||||
layer.get_end_lsn()
|
||||
);
|
||||
return true;
|
||||
} else {
|
||||
trace!("found singleton layer for {}, {} {}", seg, lsn, newer_lsn);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
segentry.newer_image_layer_exists(lsn)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
trace!("no later layer found for {}, {}", seg, lsn);
|
||||
false
|
||||
}
|
||||
|
||||
/// Is there any layer for given segment that is alive at the lsn?
|
||||
///
|
||||
/// This is a public wrapper for SegEntry fucntion,
|
||||
/// used for garbage collection, to determine if some alive layer
|
||||
/// exists at the lsn. If so, we shouldn't delete a newer dropped layer
|
||||
/// to avoid incorrectly making it visible.
|
||||
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
Ok(if let Some(segentry) = self.segs.get(&seg) {
|
||||
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
})
|
||||
}
|
||||
|
||||
/// Return the oldest in-memory layer, along with its generation number.
|
||||
pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
|
||||
if let Some(opensegentry) = self.open_segs.peek() {
|
||||
Some((Arc::clone(&opensegentry.layer), opensegentry.generation))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
self.open_layers
|
||||
.peek()
|
||||
.map(|oldest_entry| (Arc::clone(&oldest_entry.layer), oldest_entry.generation))
|
||||
}
|
||||
|
||||
/// Increment the generation number used to stamp open in-memory layers. Layers
|
||||
@@ -317,7 +215,7 @@ impl LayerMap {
|
||||
|
||||
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
|
||||
HistoricLayerIter {
|
||||
segiter: self.segs.iter(),
|
||||
seg_iter: self.segs.iter(),
|
||||
iter: None,
|
||||
}
|
||||
}
|
||||
@@ -331,7 +229,7 @@ impl LayerMap {
|
||||
open.dump()?;
|
||||
}
|
||||
|
||||
for (_, layer) in segentry.historic.iter() {
|
||||
for layer in segentry.historic.iter() {
|
||||
layer.dump()?;
|
||||
}
|
||||
}
|
||||
@@ -340,19 +238,114 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LayerMap {
|
||||
fn default() -> Self {
|
||||
LayerMap {
|
||||
segs: HashMap::new(),
|
||||
open_segs: BinaryHeap::new(),
|
||||
current_generation: 0,
|
||||
}
|
||||
impl IntervalItem for dyn Layer {
|
||||
type Key = Lsn;
|
||||
|
||||
fn start_key(&self) -> Lsn {
|
||||
self.get_start_lsn()
|
||||
}
|
||||
fn end_key(&self) -> Lsn {
|
||||
self.get_end_lsn()
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
|
||||
/// associated with the segment.
|
||||
///
|
||||
/// The last layer that is open for writes is always an InMemoryLayer,
|
||||
/// and is kept in a separate field, because there can be only one for
|
||||
/// each segment. The older layers, stored on disk, are kept in an
|
||||
/// IntervalTree.
|
||||
#[derive(Default)]
|
||||
struct SegEntry {
|
||||
open: Option<Arc<InMemoryLayer>>,
|
||||
historic: IntervalTree<dyn Layer>,
|
||||
}
|
||||
|
||||
impl SegEntry {
|
||||
/// Does the segment exist at given LSN?
|
||||
/// Return None if object is not found in this SegEntry.
|
||||
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
|
||||
if let Some(layer) = self.get(lsn) {
|
||||
Ok(Some(layer.get_seg_exists(lsn)?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
if let Some(open) = &self.open {
|
||||
if open.get_start_lsn() <= lsn {
|
||||
let x: Arc<dyn Layer> = Arc::clone(open) as _;
|
||||
return Some(x);
|
||||
}
|
||||
}
|
||||
|
||||
self.historic.search(lsn)
|
||||
}
|
||||
|
||||
pub fn newer_image_layer_exists(&self, lsn: Lsn) -> bool {
|
||||
// We only check on-disk layers, because
|
||||
// in-memory layers are not durable
|
||||
|
||||
self.historic
|
||||
.iter_newer(lsn)
|
||||
.any(|layer| !layer.is_incremental())
|
||||
}
|
||||
|
||||
// Set new open layer for a SegEntry.
|
||||
// It's ok to rewrite previous open layer,
|
||||
// but only if it is not writeable anymore.
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
if let Some(prev_open) = &self.open {
|
||||
assert!(!prev_open.is_writeable());
|
||||
}
|
||||
self.open = Some(layer);
|
||||
}
|
||||
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
self.historic.insert(layer);
|
||||
}
|
||||
}
|
||||
|
||||
/// Entry held in LayerMap::open_layers, with boilerplate comparison routines
|
||||
/// to implement a min-heap ordered by 'oldest_pending_lsn' and 'generation'
|
||||
///
|
||||
/// The generation number associated with each entry can be used to distinguish
|
||||
/// recently-added entries (i.e after last call to increment_generation()) from older
|
||||
/// entries with the same 'oldest_pending_lsn'.
|
||||
struct OpenLayerEntry {
|
||||
pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
pub generation: u64,
|
||||
pub layer: Arc<InMemoryLayer>,
|
||||
}
|
||||
impl Ord for OpenLayerEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||
// to get that. Entries with identical oldest_pending_lsn are ordered by generation
|
||||
other
|
||||
.oldest_pending_lsn
|
||||
.cmp(&self.oldest_pending_lsn)
|
||||
.then_with(|| other.generation.cmp(&self.generation))
|
||||
}
|
||||
}
|
||||
impl PartialOrd for OpenLayerEntry {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl PartialEq for OpenLayerEntry {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
impl Eq for OpenLayerEntry {}
|
||||
|
||||
/// Iterator returned by LayerMap::iter_historic_layers()
|
||||
pub struct HistoricLayerIter<'a> {
|
||||
segiter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
|
||||
iter: Option<std::collections::btree_map::Iter<'a, Lsn, Arc<dyn Layer>>>,
|
||||
seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
|
||||
iter: Option<IntervalIter<'a, dyn Layer>>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for HistoricLayerIter<'a> {
|
||||
@@ -362,11 +355,11 @@ impl<'a> Iterator for HistoricLayerIter<'a> {
|
||||
loop {
|
||||
if let Some(x) = &mut self.iter {
|
||||
if let Some(x) = x.next() {
|
||||
return Some(Arc::clone(&*x.1));
|
||||
return Some(Arc::clone(&x));
|
||||
}
|
||||
}
|
||||
if let Some(seg) = self.segiter.next() {
|
||||
self.iter = Some(seg.1.historic.iter());
|
||||
if let Some((_tag, segentry)) = self.seg_iter.next() {
|
||||
self.iter = Some(segentry.historic.iter());
|
||||
continue;
|
||||
} else {
|
||||
return None;
|
||||
@@ -374,3 +367,78 @@ impl<'a> Iterator for HistoricLayerIter<'a> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::PageServerConf;
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
/// Construct a dummy InMemoryLayer for testing
|
||||
fn dummy_inmem_layer(
|
||||
conf: &'static PageServerConf,
|
||||
segno: u32,
|
||||
start_lsn: Lsn,
|
||||
oldest_pending_lsn: Lsn,
|
||||
) -> Arc<InMemoryLayer> {
|
||||
Arc::new(
|
||||
InMemoryLayer::create(
|
||||
conf,
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
SegmentTag {
|
||||
rel: TESTREL_A,
|
||||
segno,
|
||||
},
|
||||
start_lsn,
|
||||
oldest_pending_lsn,
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_open_layers() -> Result<()> {
|
||||
let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
|
||||
let mut layers = LayerMap::default();
|
||||
|
||||
let gen1 = layers.increment_generation();
|
||||
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
|
||||
|
||||
let gen2 = layers.increment_generation();
|
||||
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
|
||||
|
||||
// A helper function (closure) to pop the next oldest open entry from the layer map,
|
||||
// and assert that it is what we'd expect
|
||||
let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
|
||||
let (l, generation) = layers.peek_oldest_open().unwrap();
|
||||
assert!(l.get_seg_tag().segno == expected_segno);
|
||||
assert!(generation == expected_generation);
|
||||
layers.pop_oldest_open();
|
||||
};
|
||||
|
||||
assert_pop_layer(0, gen1); // 0x100
|
||||
assert_pop_layer(5, gen2); // 0x100
|
||||
assert_pop_layer(3, gen1); // 0x110
|
||||
assert_pop_layer(4, gen2); // 0x110
|
||||
assert_pop_layer(2, gen1); // 0x120
|
||||
assert_pop_layer(1, gen1); // 0x200
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
150
pageserver/src/layered_repository/page_versions.rs
Normal file
150
pageserver/src/layered_repository/page_versions.rs
Normal file
@@ -0,0 +1,150 @@
|
||||
use std::{collections::HashMap, ops::RangeBounds, slice};
|
||||
|
||||
use zenith_utils::{lsn::Lsn, vec_map::VecMap};
|
||||
|
||||
use super::storage_layer::PageVersion;
|
||||
|
||||
const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[];
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct PageVersions(HashMap<u32, VecMap<Lsn, PageVersion>>);
|
||||
|
||||
impl PageVersions {
|
||||
pub fn append_or_update_last(
|
||||
&mut self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
page_version: PageVersion,
|
||||
) -> Option<PageVersion> {
|
||||
let map = self.0.entry(blknum).or_insert_with(VecMap::default);
|
||||
map.append_or_update_last(lsn, page_version).unwrap()
|
||||
}
|
||||
|
||||
/// Get all [`PageVersion`]s in a block
|
||||
pub fn get_block_slice(&self, blknum: u32) -> &[(Lsn, PageVersion)] {
|
||||
self.0
|
||||
.get(&blknum)
|
||||
.map(VecMap::as_slice)
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
}
|
||||
|
||||
/// Get a range of [`PageVersions`] in a block
|
||||
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
range: R,
|
||||
) -> &[(Lsn, PageVersion)] {
|
||||
self.0
|
||||
.get(&blknum)
|
||||
.map(|vec_map| vec_map.slice_range(range))
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
}
|
||||
|
||||
/// Iterate through [`PageVersion`]s in (block, lsn) order.
|
||||
/// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
|
||||
pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
|
||||
let mut ordered_blocks: Vec<u32> = self.0.keys().cloned().collect();
|
||||
ordered_blocks.sort_unstable();
|
||||
|
||||
let slice = ordered_blocks
|
||||
.first()
|
||||
.map(|&blknum| self.get_block_slice(blknum))
|
||||
.unwrap_or(EMPTY_SLICE);
|
||||
|
||||
OrderedPageVersionIter {
|
||||
page_versions: self,
|
||||
ordered_blocks,
|
||||
cur_block_idx: 0,
|
||||
cutoff_lsn,
|
||||
cur_slice_iter: slice.iter(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OrderedPageVersionIter<'a> {
|
||||
page_versions: &'a PageVersions,
|
||||
|
||||
ordered_blocks: Vec<u32>,
|
||||
cur_block_idx: usize,
|
||||
|
||||
cutoff_lsn: Option<Lsn>,
|
||||
|
||||
cur_slice_iter: slice::Iter<'a, (Lsn, PageVersion)>,
|
||||
}
|
||||
|
||||
impl OrderedPageVersionIter<'_> {
|
||||
fn is_lsn_before_cutoff(&self, lsn: &Lsn) -> bool {
|
||||
if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
|
||||
lsn < cutoff_lsn
|
||||
} else {
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for OrderedPageVersionIter<'a> {
|
||||
type Item = (u32, Lsn, &'a PageVersion);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some((lsn, page_version)) = self.cur_slice_iter.next() {
|
||||
if self.is_lsn_before_cutoff(lsn) {
|
||||
let blknum = self.ordered_blocks[self.cur_block_idx];
|
||||
return Some((blknum, *lsn, page_version));
|
||||
}
|
||||
}
|
||||
|
||||
let next_block_idx = self.cur_block_idx + 1;
|
||||
let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
|
||||
self.cur_block_idx = next_block_idx;
|
||||
self.cur_slice_iter = self.page_versions.get_block_slice(blknum).iter();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
const EMPTY_PAGE_VERSION: PageVersion = PageVersion {
|
||||
page_image: None,
|
||||
record: None,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_ordered_iter() {
|
||||
let mut page_versions = PageVersions::default();
|
||||
const BLOCKS: u32 = 1000;
|
||||
const LSNS: u64 = 50;
|
||||
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..LSNS {
|
||||
let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION);
|
||||
assert!(old.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
let mut iter = page_versions.ordered_page_version_iter(None);
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..LSNS {
|
||||
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
|
||||
assert_eq!(actual_blknum, blknum);
|
||||
assert_eq!(Lsn(lsn), actual_lsn);
|
||||
}
|
||||
}
|
||||
assert!(iter.next().is_none());
|
||||
assert!(iter.next().is_none()); // should be robust against excessive next() calls
|
||||
|
||||
const CUTOFF_LSN: Lsn = Lsn(30);
|
||||
let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..CUTOFF_LSN.0 {
|
||||
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
|
||||
assert_eq!(actual_blknum, blknum);
|
||||
assert_eq!(Lsn(lsn), actual_lsn);
|
||||
}
|
||||
}
|
||||
assert!(iter.next().is_none());
|
||||
assert!(iter.next().is_none()); // should be robust against excessive next() calls
|
||||
}
|
||||
}
|
||||
@@ -3,14 +3,12 @@
|
||||
//!
|
||||
|
||||
use crate::relish::RelishTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::ZTimelineId;
|
||||
use crate::repository::{PageReconstructData, PageReconstructResult};
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
@@ -21,7 +19,7 @@ pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
|
||||
/// Each relish stored in the repository is divided into fixed-sized "segments",
|
||||
/// with 10 MB of key-space, or 1280 8k pages each.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct SegmentTag {
|
||||
pub rel: RelishTag,
|
||||
pub segno: u32,
|
||||
@@ -46,56 +44,6 @@ impl SegmentTag {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Represents a version of a page at a specific LSN. The LSN is the key of the
|
||||
/// entry in the 'page_versions' hash, it is not duplicated here.
|
||||
///
|
||||
/// A page version can be stored as a full page image, or as WAL record that needs
|
||||
/// to be applied over the previous page version to reconstruct this version.
|
||||
///
|
||||
/// It's also possible to have both a WAL record and a page image in the same
|
||||
/// PageVersion. That happens if page version is originally stored as a WAL record
|
||||
/// but it is later reconstructed by a GetPage@LSN request by performing WAL
|
||||
/// redo. The get_page_at_lsn() code will store the reconstructed pag image next to
|
||||
/// the WAL record in that case. TODO: That's pretty accidental, not the result
|
||||
/// of any grand design. If we want to keep reconstructed page versions around, we
|
||||
/// probably should have a separate buffer cache so that we could control the
|
||||
/// replacement policy globally. Or if we keep a reconstructed page image, we
|
||||
/// could throw away the WAL record.
|
||||
///
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PageVersion {
|
||||
/// an 8kb page image
|
||||
pub page_image: Option<Bytes>,
|
||||
/// WAL record to get from previous page version to this one.
|
||||
pub record: Option<WALRecord>,
|
||||
}
|
||||
|
||||
///
|
||||
/// Data needed to reconstruct a page version
|
||||
///
|
||||
/// 'page_img' is the old base image of the page to start the WAL replay with.
|
||||
/// It can be None, if the first WAL record initializes the page (will_init)
|
||||
/// 'records' contains the records to apply over the base image.
|
||||
///
|
||||
pub struct PageReconstructData {
|
||||
pub records: Vec<WALRecord>,
|
||||
pub page_img: Option<Bytes>,
|
||||
}
|
||||
|
||||
/// Return value from Layer::get_page_reconstruct_data
|
||||
pub enum PageReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should collect
|
||||
/// more data from the returned predecessor layer at the returned LSN.
|
||||
Continue(Lsn, Arc<dyn Layer>),
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing(Lsn),
|
||||
}
|
||||
|
||||
///
|
||||
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
@@ -105,21 +53,22 @@ pub enum PageReconstructResult {
|
||||
/// in-memory and on-disk layers.
|
||||
///
|
||||
pub trait Layer: Send + Sync {
|
||||
fn get_tenant_id(&self) -> ZTenantId;
|
||||
|
||||
/// Identify the timeline this relish belongs to
|
||||
fn get_timeline_id(&self) -> ZTimelineId;
|
||||
|
||||
/// Identify the relish segment
|
||||
fn get_seg_tag(&self) -> SegmentTag;
|
||||
|
||||
/// Inclusive start bound of the LSN range that this layer hold
|
||||
/// Inclusive start bound of the LSN range that this layer holds
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
|
||||
/// 'end_lsn' meaning depends on the layer kind:
|
||||
/// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
|
||||
/// - image layer represents snapshot at one LSN, so end_lsn = lsn
|
||||
/// - delta layer has end_lsn
|
||||
/// Exclusive end bound of the LSN range that this layer holds.
|
||||
///
|
||||
/// TODO Is end_lsn always exclusive for all layer kinds?
|
||||
/// - For an open in-memory layer, this is MAX_LSN.
|
||||
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
fn get_end_lsn(&self) -> Lsn;
|
||||
|
||||
/// Is the segment represented by this layer dropped by PostgreSQL?
|
||||
@@ -142,8 +91,8 @@ pub trait Layer: Send + Sync {
|
||||
///
|
||||
/// See PageReconstructResult for possible return values. The collected data
|
||||
/// is appended to reconstruct_data; the caller should pass an empty struct
|
||||
/// on first call. If this returns PageReconstructResult::Continue, call
|
||||
/// again on the returned predecessor layer with the same 'reconstruct_data'
|
||||
/// on first call. If this returns PageReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data'
|
||||
/// to collect more data.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
|
||||
@@ -9,18 +9,45 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod buffered_repository;
|
||||
pub mod http;
|
||||
pub mod layered_repository;
|
||||
pub mod logger;
|
||||
pub mod page_service;
|
||||
pub mod relish;
|
||||
pub mod relish_storage;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tenant_mgr;
|
||||
pub mod toast_store;
|
||||
pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
|
||||
pub mod defaults {
|
||||
use const_format::formatcp;
|
||||
use std::time::Duration;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
|
||||
// Minimal size of WAL records chain to trigger materialization of the page
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
|
||||
|
||||
pub const DEFAULT_UPLOAD_DISTANCE: u64 = 1024 * 1024 * 1024;
|
||||
pub const DEFAULT_UPLOAD_PERIOD: Duration = Duration::from_secs(2500);
|
||||
|
||||
pub const DEFAULT_RECONSTRUCT_THRESHOLD: u64 = 0;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 1024;
|
||||
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_live_connections_count",
|
||||
@@ -30,11 +57,22 @@ lazy_static! {
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
pub const LOG_FILE_NAME: &str = "pageserver.log";
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PageServerConf {
|
||||
pub daemonize: bool,
|
||||
pub listen_addr: String,
|
||||
pub http_endpoint_addr: String,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
// page server crashes.
|
||||
pub checkpoint_distance: u64,
|
||||
pub checkpoint_period: Duration,
|
||||
pub upload_period: Duration,
|
||||
pub upload_distance: u64,
|
||||
pub reconstruct_threshold: u64,
|
||||
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
pub superuser: String,
|
||||
@@ -52,6 +90,7 @@ pub struct PageServerConf {
|
||||
pub auth_type: AuthType,
|
||||
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub relish_storage_config: Option<RelishStorageConfig>,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
@@ -95,10 +134,6 @@ impl PageServerConf {
|
||||
self.timeline_path(timelineid, tenantid).join("ancestor")
|
||||
}
|
||||
|
||||
fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timeline_path(timelineid, tenantid).join("wal")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
@@ -110,4 +145,76 @@ impl PageServerConf {
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn test_repo_dir(test_name: &str) -> PathBuf {
|
||||
PathBuf::from(format!("../tmp_check/test_{}", test_name))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dummy_conf(repo_dir: PathBuf) -> Self {
|
||||
PageServerConf {
|
||||
daemonize: false,
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
checkpoint_period: Duration::from_secs(10),
|
||||
upload_distance: defaults::DEFAULT_UPLOAD_DISTANCE,
|
||||
upload_period: defaults::DEFAULT_UPLOAD_PERIOD,
|
||||
reconstruct_threshold: defaults::DEFAULT_RECONSTRUCT_THRESHOLD,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
superuser: "zenith_admin".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: "".into(),
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
relish_storage_config: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// External relish storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RelishStorageConfig {
|
||||
/// Limits the number of concurrent sync operations between pageserver and relish storage.
|
||||
pub max_concurrent_sync: usize,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RelishStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a relish storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum RelishStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored relish data into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all relishes into the root
|
||||
/// of the S3 bucket from the config.
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// "Login" to use when connecting to bucket.
|
||||
/// Can be empty for cases like AWS k8s IAM
|
||||
/// where we can allow certain pods to connect
|
||||
/// to the bucket directly without any credentials.
|
||||
pub access_key_id: Option<String>,
|
||||
/// "Password" to use when connecting to bucket.
|
||||
pub secret_access_key: Option<String>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
use crate::PageServerConf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use slog::{Drain, FnValue};
|
||||
use std::fs::{File, OpenOptions};
|
||||
|
||||
pub fn init_logging(
|
||||
_conf: &PageServerConf,
|
||||
log_filename: &str,
|
||||
) -> Result<(slog_scope::GlobalLoggerGuard, File)> {
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
|
||||
let logger_file = log_file.try_clone().unwrap();
|
||||
|
||||
let decorator = slog_term::PlainSyncDecorator::new(logger_file);
|
||||
let drain = slog_term::FullFormat::new(decorator).build();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
if record.level().is_at_least(slog::Level::Info) {
|
||||
return true;
|
||||
}
|
||||
false
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(
|
||||
drain,
|
||||
slog::o!(
|
||||
"location" =>
|
||||
FnValue(move |record| {
|
||||
format!("{}, {}:{}",
|
||||
record.module(),
|
||||
record.file(),
|
||||
record.line()
|
||||
)
|
||||
}
|
||||
)
|
||||
),
|
||||
);
|
||||
Ok((slog_scope::set_global_logger(logger), log_file))
|
||||
}
|
||||
@@ -10,10 +10,9 @@
|
||||
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::net::TcpListener;
|
||||
use std::str;
|
||||
@@ -21,10 +20,12 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::{io, net::TcpStream};
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::auth::{self, JwtAuth};
|
||||
use zenith_utils::auth::{Claims, Scope};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::is_socket_read_timed_out;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::postgres_backend::{self, AuthType};
|
||||
use zenith_utils::pq_proto::{
|
||||
@@ -35,72 +36,110 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
use crate::basebackup;
|
||||
use crate::branches;
|
||||
use crate::relish::*;
|
||||
use crate::repository::Timeline;
|
||||
use crate::tenant_mgr;
|
||||
use crate::walreceiver;
|
||||
use crate::PageServerConf;
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamFeMessage {
|
||||
Exists(PagestreamRequest),
|
||||
Nblocks(PagestreamRequest),
|
||||
Read(PagestreamRequest),
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamBeMessage {
|
||||
Status(PagestreamStatusResponse),
|
||||
Nblocks(PagestreamStatusResponse),
|
||||
Read(PagestreamReadResponse),
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamRequest {
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
relnode: u32,
|
||||
forknum: u8,
|
||||
blkno: u32,
|
||||
struct PagestreamExistsRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamStatusResponse {
|
||||
ok: bool,
|
||||
struct PagestreamNblocksRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamGetPageRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamExistsResponse {
|
||||
exists: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamNblocksResponse {
|
||||
n_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamReadResponse {
|
||||
ok: bool,
|
||||
n_blocks: u32,
|
||||
struct PagestreamGetPageResponse {
|
||||
page: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamErrorResponse {
|
||||
message: String,
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// TODO these gets can fail
|
||||
|
||||
let smgr_tag = body.get_u8();
|
||||
let zreq = PagestreamRequest {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
blkno: body.get_u32(),
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
};
|
||||
|
||||
// these correspond to the ZenithMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
match smgr_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(zreq)),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(zreq)),
|
||||
2 => Ok(PagestreamFeMessage::Read(zreq)),
|
||||
_ => Err(anyhow!(
|
||||
"unknown smgr message tag: {},'{:?}'",
|
||||
smgr_tag,
|
||||
body
|
||||
)),
|
||||
let msg_tag = body.get_u8();
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
})),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
})),
|
||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
rel: RelTag {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
},
|
||||
blkno: body.get_u32(),
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -110,24 +149,26 @@ impl PagestreamBeMessage {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Status(resp) => {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(100); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.ok as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(101); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.ok as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::Read(resp) => {
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(102); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.ok as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
bytes.put(&resp.page[..]);
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(103); /* tag from pagestore_client.h */
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -147,17 +188,32 @@ pub fn thread_main(
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let mut join_handles = Vec::new();
|
||||
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
let (socket, peer_addr) = listener.accept()?;
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
let local_auth = auth.clone();
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
|
||||
error!("error: {}", err);
|
||||
}
|
||||
});
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name("serving Page Service thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
|
||||
error!(%err, "page server thread exited with error");
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
join_handles.push(handle);
|
||||
}
|
||||
|
||||
debug!("page_service loop terminated. wait for connections to cancel");
|
||||
for handle in join_handles.into_iter() {
|
||||
handle.join().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn page_service_conn_main(
|
||||
@@ -176,7 +232,7 @@ fn page_service_conn_main(
|
||||
}
|
||||
|
||||
let mut conn_handler = PageServerHandler::new(conf, auth);
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
@@ -214,122 +270,180 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_controlfile(&self, pgb: &mut PostgresBackend) -> io::Result<()> {
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::ControlFile)?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_pagerequests(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
let repository = tenant_mgr::get_repository_for_tenant(&tenantid)?;
|
||||
let timeline = repository
|
||||
.get_timeline(timelineid)
|
||||
.context(format!("error fetching timeline {}", timelineid))?;
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
|
||||
while let Some(message) = pgb.read_message()? {
|
||||
trace!("query({:?}): {:?}", timelineid, message);
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
match pgb.read_message() {
|
||||
Ok(message) => {
|
||||
if let Some(message) = message {
|
||||
trace!("query: {:?}", message);
|
||||
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
_ => continue,
|
||||
};
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
|
||||
let response = match zenith_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let rel = RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
let response = match zenith_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_rel_exists_request(&*timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_nblocks_request(&*timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(&*timeline, &req)
|
||||
}),
|
||||
};
|
||||
|
||||
let exist = SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists"])
|
||||
.observe_closure_duration(|| {
|
||||
timeline.get_rel_exists(tag, req.lsn).unwrap_or(false)
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough
|
||||
error!("error reading relation or page version: {:#}", e);
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
|
||||
PagestreamBeMessage::Status(PagestreamStatusResponse {
|
||||
ok: exist,
|
||||
n_blocks: 0,
|
||||
})
|
||||
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let rel = RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
|
||||
let n_blocks = SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size"])
|
||||
.observe_closure_duration(|| {
|
||||
// Return 0 if relation is not found.
|
||||
// This is what postgres smgr expects.
|
||||
timeline
|
||||
.get_relish_size(tag, req.lsn)
|
||||
.unwrap_or(Some(0))
|
||||
.unwrap_or(0)
|
||||
});
|
||||
|
||||
PagestreamBeMessage::Nblocks(PagestreamStatusResponse { ok: true, n_blocks })
|
||||
Err(e) => {
|
||||
if !is_socket_read_timed_out(&e) {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::Read(req) => {
|
||||
let rel = RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
|
||||
let read_response = SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn"])
|
||||
.observe_closure_duration(|| {
|
||||
match timeline.get_page_at_lsn(tag, req.blkno, req.lsn) {
|
||||
Ok(p) => PagestreamReadResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
page: p,
|
||||
},
|
||||
Err(e) => {
|
||||
const ZERO_PAGE: [u8; 8192] = [0; 8192];
|
||||
error!("get_page_at_lsn: {}", e);
|
||||
PagestreamReadResponse {
|
||||
ok: false,
|
||||
n_blocks: 0,
|
||||
page: Bytes::from_static(&ZERO_PAGE),
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
PagestreamBeMessage::Read(read_response)
|
||||
}
|
||||
};
|
||||
|
||||
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper function to handle the LSN from client request.
|
||||
///
|
||||
/// Each GetPage (and Exists and Nblocks) request includes information about
|
||||
/// which version of the page is being requested. The client can request the
|
||||
/// latest version of the page, or the version that's valid at a particular
|
||||
/// LSN. The primary compute node will always request the latest page
|
||||
/// version, while a standby will request a version at the LSN that it's
|
||||
/// currently caught up to.
|
||||
///
|
||||
/// In either case, if the page server hasn't received the WAL up to the
|
||||
/// requested LSN yet, we will wait for it to arrive. The return value is
|
||||
/// the LSN that should be used to look up the page versions.
|
||||
fn wait_or_get_last_lsn(timeline: &dyn Timeline, lsn: Lsn, latest: bool) -> Result<Lsn> {
|
||||
if latest {
|
||||
// Latest page version was requested. If LSN is given, it is a hint
|
||||
// to the page server that there have been no modifications to the
|
||||
// page after that LSN. If we haven't received WAL up to that point,
|
||||
// wait until it arrives.
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
|
||||
// Note: this covers the special case that lsn == Lsn(0). That
|
||||
// special case means "return the latest version whatever it is",
|
||||
// and it's used for bootstrapping purposes, when the page server is
|
||||
// connected directly to the compute node. That is needed because
|
||||
// when you connect to the compute node, to receive the WAL, the
|
||||
// walsender process will do a look up in the pg_authid catalog
|
||||
// table for authentication. That poses a deadlock problem: the
|
||||
// catalog table lookup will send a GetPage request, but the GetPage
|
||||
// request will block in the page server because the recent WAL
|
||||
// hasn't been received yet, and it cannot be received until the
|
||||
// walsender completes the authentication and starts streaming the
|
||||
// WAL.
|
||||
if lsn <= last_record_lsn {
|
||||
Ok(last_record_lsn)
|
||||
} else {
|
||||
timeline.wait_lsn(lsn)?;
|
||||
// Since we waited for 'lsn' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the
|
||||
// last-record LSN can advance immediately after we return
|
||||
// anyway)
|
||||
Ok(lsn)
|
||||
}
|
||||
} else {
|
||||
if lsn == Lsn(0) {
|
||||
bail!("invalid LSN(0) in request");
|
||||
}
|
||||
timeline.wait_lsn(lsn)?;
|
||||
Ok(lsn)
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_get_rel_exists_request(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamExistsRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
let exists = timeline.get_rel_exists(tag, lsn)?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
exists,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_nblocks_request(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamNblocksRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
let n_blocks = timeline.get_relish_size(tag, lsn)?;
|
||||
|
||||
// Return 0 if relation is not found.
|
||||
// This is what postgres smgr expects.
|
||||
let n_blocks = n_blocks.unwrap_or(0);
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
|
||||
.entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_basebackup_request(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend,
|
||||
@@ -337,19 +451,20 @@ impl PageServerHandler {
|
||||
lsn: Option<Lsn>,
|
||||
tenantid: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
|
||||
let _enter = span.enter();
|
||||
|
||||
// check that the timeline exists
|
||||
let repository = tenant_mgr::get_repository_for_tenant(&tenantid)?;
|
||||
let timeline = repository
|
||||
.get_timeline(timelineid)
|
||||
.context(format!("error fetching timeline {}", timelineid))?;
|
||||
/* switch client to COPYOUT */
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message(&BeMessage::CopyOutResponse)?;
|
||||
info!("sent CopyOut");
|
||||
|
||||
/* Send a tarball of the latest layer on the timeline */
|
||||
{
|
||||
let mut writer = CopyDataSink { pgb };
|
||||
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn);
|
||||
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
|
||||
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
||||
basebackup.send_tarball()?;
|
||||
}
|
||||
pgb.write_message(&BeMessage::CopyDone)?;
|
||||
@@ -421,9 +536,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
}
|
||||
let query_string = std::str::from_utf8(&query_string)?;
|
||||
|
||||
if query_string.starts_with("controlfile") {
|
||||
self.handle_controlfile(pgb)?;
|
||||
} else if query_string.starts_with("pagestream ") {
|
||||
if query_string.starts_with("pagestream ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
ensure!(
|
||||
@@ -456,11 +569,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
None
|
||||
};
|
||||
|
||||
info!(
|
||||
"got basebackup command. tenantid=\"{}\" timelineid=\"{}\" lsn=\"{:#?}\"",
|
||||
tenantid, timelineid, lsn
|
||||
);
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
@@ -478,11 +586,11 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
self.check_permission(Some(tenantid))?;
|
||||
|
||||
let _enter =
|
||||
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
let repository = tenant_mgr::get_repository_for_tenant(&tenantid)?;
|
||||
repository
|
||||
.get_timeline(timelineid)
|
||||
.context(format!("error fetching timeline {}", timelineid))?;
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
walreceiver::launch_wal_receiver(self.conf, timelineid, &connstr, tenantid.to_owned());
|
||||
|
||||
@@ -503,6 +611,9 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
self.check_permission(Some(tenantid))?;
|
||||
|
||||
let _enter =
|
||||
info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();
|
||||
|
||||
let branch =
|
||||
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
|
||||
let branch = serde_json::to_vec(&branch)?;
|
||||
@@ -577,58 +688,26 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.map(|h| h.as_str().parse())
|
||||
.unwrap_or(Ok(self.conf.gc_horizon))?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(&tenantid)?;
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layer_relfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_removed"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"meta_total"),
|
||||
RowDescriptor::int8_col(b"meta_removed"),
|
||||
RowDescriptor::int8_col(b"meta_dropped"),
|
||||
RowDescriptor::int8_col(b"pages_total"),
|
||||
RowDescriptor::int8_col(b"pages_removed"),
|
||||
RowDescriptor::int8_col(b"pages_dropped"),
|
||||
RowDescriptor::int8_col(b"elapsed"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
|
||||
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.meta_total.to_string().as_bytes()),
|
||||
Some(result.meta_removed.to_string().as_bytes()),
|
||||
Some(result.meta_dropped.to_string().as_bytes()),
|
||||
Some(result.pages_total.to_string().as_bytes()),
|
||||
Some(result.pages_removed.to_string().as_bytes()),
|
||||
Some(result.pages_dropped.to_string().as_bytes()),
|
||||
Some(result.elapsed.as_millis().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
87
pageserver/src/relish_storage.rs
Normal file
87
pageserver/src/relish_storage.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
//! Abstractions for the page server to store its relish layer data in the external storage.
|
||||
//!
|
||||
//! Main purpose of this module subtree is to provide a set of abstractions to manage the storage state
|
||||
//! in a way, optimal for page server.
|
||||
//!
|
||||
//! The abstractions hide multiple custom external storage API implementations,
|
||||
//! such as AWS S3, local filesystem, etc., located in the submodules.
|
||||
|
||||
mod local_fs;
|
||||
mod rust_s3;
|
||||
/// A queue-based storage with the background machinery behind it to synchronize
|
||||
/// local page server layer files with external storage.
|
||||
mod synced_storage;
|
||||
|
||||
use std::{path::Path, thread};
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
pub use self::synced_storage::schedule_timeline_upload;
|
||||
use self::{local_fs::LocalFs, rust_s3::RustS3};
|
||||
use crate::{PageServerConf, RelishStorageKind};
|
||||
|
||||
pub fn run_storage_sync_thread(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
match &config.relish_storage_config {
|
||||
Some(relish_storage_config) => {
|
||||
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
|
||||
match &relish_storage_config.storage {
|
||||
RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
|
||||
config,
|
||||
LocalFs::new(root.clone())?,
|
||||
max_concurrent_sync,
|
||||
),
|
||||
RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
|
||||
config,
|
||||
RustS3::new(s3_config)?,
|
||||
max_concurrent_sync,
|
||||
),
|
||||
}
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RelishStorage: Send + Sync {
|
||||
type RelishStoragePath;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath>;
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
|
||||
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
// rust_s3 `get_object_stream` method requires `std::io::BufWriter` for some reason, not the async counterpart
|
||||
// that forces us to consume and return the writer to satisfy the blocking operation async wrapper requirements
|
||||
to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>>;
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
|
||||
|
||||
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut tokio::io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
fn strip_workspace_prefix<'a>(
|
||||
page_server_workdir: &'a Path,
|
||||
relish_local_path: &'a Path,
|
||||
) -> anyhow::Result<&'a Path> {
|
||||
relish_local_path
|
||||
.strip_prefix(page_server_workdir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Unexpected: relish local path '{}' is not relevant to server workdir",
|
||||
relish_local_path.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
189
pageserver/src/relish_storage/local_fs.rs
Normal file
189
pageserver/src/relish_storage/local_fs.rs
Normal file
@@ -0,0 +1,189 @@
|
||||
//! Local filesystem relish storage.
|
||||
//!
|
||||
//! Page server already stores layer data on the server, when freezing it.
|
||||
//! This storage serves a way to
|
||||
//!
|
||||
//! * test things locally simply
|
||||
//! * allow to compabre both binary sets
|
||||
//! * help validating the relish storage API
|
||||
|
||||
use std::{
|
||||
future::Future,
|
||||
io::Write,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::{fs, io};
|
||||
|
||||
use super::{strip_workspace_prefix, RelishStorage};
|
||||
|
||||
pub struct LocalFs {
|
||||
root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Atetmpts to create local FS relish storage, also creates the directory provided, if not exists.
|
||||
pub fn new(root: PathBuf) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
"Failed to create all directories in the given root path {}",
|
||||
root.display(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(Self { root })
|
||||
}
|
||||
|
||||
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
|
||||
if path.is_relative() {
|
||||
Ok(self.root.join(path))
|
||||
} else if path.starts_with(&self.root) {
|
||||
Ok(path.to_path_buf())
|
||||
} else {
|
||||
bail!(
|
||||
"Path '{}' does not belong to the current storage",
|
||||
path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RelishStorage for LocalFs {
|
||||
type RelishStoragePath = PathBuf;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
Ok(strip_workspace_prefix(page_server_workdir, relish_local_path)?.to_path_buf())
|
||||
}
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
Ok(get_all_files(&self.root).await?.into_iter().collect())
|
||||
}
|
||||
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
mut to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let updated_buffer = tokio::task::spawn_blocking(move || {
|
||||
let mut source = std::io::BufReader::new(
|
||||
std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
std::io::copy(&mut source, &mut to)
|
||||
.context("Failed to download the relish file")?;
|
||||
to.flush().context("Failed to flush the download buffer")?;
|
||||
Ok::<_, anyhow::Error>(to)
|
||||
})
|
||||
.await
|
||||
.context("Failed to spawn a blocking task")??;
|
||||
Ok(updated_buffer)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(tokio::fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish<R: io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
create_target_directory(&target_file_path).await?;
|
||||
let mut destination = io::BufWriter::new(
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&target_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open target fs destination at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
|
||||
io::copy_buf(from, &mut destination)
|
||||
.await
|
||||
.context("Failed to upload relish to local storage")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
log::debug!("{:?} us a symlink, skipping", entry_path)
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(entry_path).await?.into_iter())
|
||||
} else {
|
||||
paths.push(dir_entry.path());
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path '{}' is not a directory", directory_path.display())
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
None => bail!(
|
||||
"Relish path '{}' has no parent directory",
|
||||
target_file_path.display()
|
||||
),
|
||||
};
|
||||
if !target_dir.exists() {
|
||||
tokio::fs::create_dir_all(target_dir).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
149
pageserver/src/relish_storage/rust_s3.rs
Normal file
149
pageserver/src/relish_storage/rust_s3.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
//! A wrapper around AWS S3 client library `rust_s3` to be used a relish storage.
|
||||
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
|
||||
use crate::{
|
||||
relish_storage::{strip_workspace_prefix, RelishStorage},
|
||||
S3Config,
|
||||
};
|
||||
|
||||
const S3_FILE_SEPARATOR: char = '/';
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
fn key(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 relish storage.
|
||||
pub struct RustS3 {
|
||||
bucket: Bucket,
|
||||
}
|
||||
|
||||
impl RustS3 {
|
||||
/// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
let region = aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
.context("Failed to parse the s3 region from config")?;
|
||||
let credentials = Credentials::new(
|
||||
aws_config.access_key_id.as_deref(),
|
||||
aws_config.secret_access_key.as_deref(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.context("Failed to create the s3 credentials")?;
|
||||
Ok(Self {
|
||||
bucket: Bucket::new_with_path_style(
|
||||
aws_config.bucket_name.as_str(),
|
||||
region,
|
||||
credentials,
|
||||
)
|
||||
.context("Failed to create the s3 bucket")?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RelishStorage for RustS3 {
|
||||
type RelishStoragePath = S3ObjectKey;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
|
||||
let mut key = String::new();
|
||||
for segment in relative_path {
|
||||
key.push(S3_FILE_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
let list_response = self
|
||||
.bucket
|
||||
.list(String::new(), None)
|
||||
.await
|
||||
.context("Failed to list s3 objects")?;
|
||||
|
||||
Ok(list_response
|
||||
.into_iter()
|
||||
.flat_map(|response| response.contents)
|
||||
.map(|s3_object| S3ObjectKey(s3_object.key))
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
mut to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>> {
|
||||
let code = self
|
||||
.bucket
|
||||
.get_object_stream(from.key(), &mut to)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during downloading object from directory, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
tokio::task::spawn_blocking(move || {
|
||||
to.flush().context("Failed to fluch the downoad buffer")?;
|
||||
Ok::<_, anyhow::Error>(to)
|
||||
})
|
||||
.await
|
||||
.context("Failed to joim the download buffer flush task")?
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let (_, code) = self
|
||||
.bucket
|
||||
.delete_object(path.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
|
||||
if code != 204 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-204 exit code during deleting object with key '{}', code: {}",
|
||||
path.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut tokio::io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let code = self
|
||||
.bucket
|
||||
.put_object_stream(from, to.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during creating object with key '{}', code: {}",
|
||||
to.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
57
pageserver/src/relish_storage/synced_storage.rs
Normal file
57
pageserver/src/relish_storage/synced_storage.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use std::time::Duration;
|
||||
use std::{collections::BinaryHeap, sync::Mutex, thread};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::{relish_storage::RelishStorage, PageServerConf};
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
|
||||
}
|
||||
|
||||
pub fn schedule_timeline_upload(_local_timeline: ()) {
|
||||
// UPLOAD_QUEUE
|
||||
// .lock()
|
||||
// .unwrap()
|
||||
// .push(SyncTask::Upload(local_timeline))
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum SyncTask {}
|
||||
|
||||
pub fn run_storage_sync_thread<
|
||||
P: std::fmt::Debug,
|
||||
S: 'static + RelishStorage<RelishStoragePath = P>,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
relish_storage: S,
|
||||
max_concurrent_sync: usize,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name("Queue based relish storage sync".to_string())
|
||||
.spawn(move || {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
|
||||
log::debug!("Upload queue length: {}", queue_accessor.len());
|
||||
let next_task = queue_accessor.pop();
|
||||
drop(queue_accessor);
|
||||
match next_task {
|
||||
Some(task) => runtime.block_on(async {
|
||||
// suppress warnings
|
||||
let _ = (config, task, &relish_storage, max_concurrent_sync);
|
||||
todo!("omitted for brevity")
|
||||
}),
|
||||
None => {
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
log::debug!("Queue based relish storage sync thread shut down");
|
||||
Ok(())
|
||||
})?;
|
||||
Ok(Some(handle))
|
||||
}
|
||||
@@ -3,7 +3,7 @@ use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::ops::AddAssign;
|
||||
use std::ops::{AddAssign, Deref};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::lsn::{Lsn, RecordLsn};
|
||||
@@ -13,15 +13,13 @@ use zenith_utils::zid::ZTimelineId;
|
||||
/// A repository corresponds to one .zenith directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
pub trait Repository: Send + Sync {
|
||||
fn shutdown(&self) -> Result<()>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
start_lsn: Lsn,
|
||||
) -> Result<Arc<dyn Timeline>>;
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
@@ -32,62 +30,41 @@ pub trait Repository: Send + Sync {
|
||||
///
|
||||
/// 'timelineid' specifies the timeline to GC, or None for all.
|
||||
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
|
||||
/// `compact` parameter is used to force compaction of storage.
|
||||
/// some storage implementation are based on lsm tree and require periodic merge (compaction).
|
||||
/// usually storage implementation determines itself when compaction should be performed.
|
||||
/// but for gc tests it way be useful to force compaction just after completion of gc iteration
|
||||
/// to make sure that all detected garbage is removed.
|
||||
/// so right now `compact` is set to true when gc explicitly requested through page srver api,
|
||||
/// and is st to false in gc threads which infinitely repeats gc iterations in loop.
|
||||
/// `checkpoint_before_gc` parameter is used to force compaction of storage before CG
|
||||
/// to make tests more deterministic.
|
||||
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
|
||||
fn gc_iteration(
|
||||
&self,
|
||||
timelineid: Option<ZTimelineId>,
|
||||
horizon: u64,
|
||||
compact: bool,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult>;
|
||||
|
||||
// TODO get timelines?
|
||||
//fn get_stats(&self) -> RepositoryStats;
|
||||
}
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default)]
|
||||
#[derive(Default, Debug)]
|
||||
pub struct GcResult {
|
||||
pub ondisk_relfiles_total: u64,
|
||||
pub ondisk_relfiles_needed_by_cutoff: u64,
|
||||
pub ondisk_relfiles_needed_by_branches: u64,
|
||||
pub ondisk_relfiles_not_updated: u64,
|
||||
pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped
|
||||
pub meta_removed: u64, // removed versions beyond PITR interval for which new page image exists
|
||||
pub meta_dropped: u64, // removed versions beyond PITR interval of dropped relations
|
||||
pub meta_total: u64, // total number of metaobject version histories
|
||||
|
||||
pub ondisk_nonrelfiles_total: u64,
|
||||
pub ondisk_nonrelfiles_needed_by_cutoff: u64,
|
||||
pub ondisk_nonrelfiles_needed_by_branches: u64,
|
||||
pub ondisk_nonrelfiles_not_updated: u64,
|
||||
pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped
|
||||
pub pages_removed: u64, // removed versions beyond PITR interval for which new page image exists
|
||||
pub pages_dropped: u64, // removed versions beyond PITR interval of dropped relations
|
||||
pub pages_total: u64, // total number of page vaersion histories
|
||||
|
||||
pub elapsed: Duration,
|
||||
}
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.ondisk_relfiles_total += other.ondisk_relfiles_total;
|
||||
self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
|
||||
self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
|
||||
self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
|
||||
self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
|
||||
self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;
|
||||
|
||||
self.ondisk_nonrelfiles_total += other.ondisk_nonrelfiles_total;
|
||||
self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
|
||||
self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
|
||||
self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
|
||||
self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
|
||||
self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;
|
||||
|
||||
self.meta_total += other.meta_total;
|
||||
self.meta_removed += other.meta_removed;
|
||||
self.meta_dropped += other.meta_dropped;
|
||||
self.pages_total += other.pages_total;
|
||||
self.pages_removed += other.pages_removed;
|
||||
self.pages_dropped += other.pages_dropped;
|
||||
self.elapsed += other.elapsed;
|
||||
}
|
||||
}
|
||||
@@ -97,11 +74,16 @@ pub trait Timeline: Send + Sync {
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(&self, tag: RelishTag, blknum: u32, lsn: Lsn) -> Result<Bytes>;
|
||||
///
|
||||
/// Wait until WAL has been received and processed up to this LSN.
|
||||
///
|
||||
/// You should call this before any of the other get_* or list_* functions. Calling
|
||||
/// those functions with an LSN that has been processed yet is an error.
|
||||
///
|
||||
fn wait_lsn(&self, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn_nowait(&self, tag: RelishTag, blknum: u32, lsn: Lsn) -> Result<Bytes>;
|
||||
/// Look up given page version.
|
||||
fn get_page_at_lsn(&self, tag: RelishTag, blknum: u32, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Get size of a relish
|
||||
fn get_relish_size(&self, tag: RelishTag, lsn: Lsn) -> Result<Option<u32>>;
|
||||
@@ -109,38 +91,29 @@ pub trait Timeline: Send + Sync {
|
||||
/// Does relation exist?
|
||||
fn get_rel_exists(&self, tag: RelishTag, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
|
||||
/// Get a list of all existing relations
|
||||
/// Pass RelTag to get relation objects or None to get nonrels.
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
/// Get a list of non-relational objects
|
||||
/// Get a list of all existing non-relational objects
|
||||
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
///
|
||||
/// Export data as delats and image layers between 'start_lsn' to 'end_lsn'. The
|
||||
/// start is inclusive, and end is exclusive.
|
||||
///
|
||||
fn export_timeline(&self, start_lsn: Lsn, end_lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Get the LSN where this branch was created
|
||||
fn get_ancestor_lsn(&self) -> Lsn;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
//
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>;
|
||||
|
||||
/// Truncate relation
|
||||
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;
|
||||
|
||||
/// This method is used for marking dropped relations and truncated SLRU files
|
||||
fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Track end of the latest digested WAL record.
|
||||
///
|
||||
/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
|
||||
/// Previous last record LSN is stored alongside the latest and can be read.
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
/// Atomically get both last and prev.
|
||||
fn get_last_record_rlsn(&self) -> RecordLsn;
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
@@ -148,25 +121,66 @@ pub trait Timeline: Send + Sync {
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors,
|
||||
/// doesnt support TwoPhase relishes yet
|
||||
fn get_current_logical_size(&self) -> usize;
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used in tests to ensure thet incremental and non incremental variants match.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RepositoryStats {
|
||||
pub num_entries: Lsn,
|
||||
pub num_page_images: Lsn,
|
||||
pub num_wal_records: Lsn,
|
||||
pub num_getpage_requests: Lsn,
|
||||
/// Various functions to mutate the timeline.
|
||||
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
|
||||
// This is probably considered a bad practice in Rust and should be fixed eventually,
|
||||
// but will cause large code changes.
|
||||
pub trait TimelineWriter: Deref<Target = dyn Timeline> {
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, lsn: Lsn, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>;
|
||||
|
||||
/// Truncate relation
|
||||
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;
|
||||
|
||||
/// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records
|
||||
fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Track end of the latest digested WAL record.
|
||||
///
|
||||
/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
|
||||
/// Previous last record LSN is stored alongside the latest and can be read.
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
|
||||
///
|
||||
/// Complete all delayed commits and advance disk_consistent_lsn
|
||||
///
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Import data from layer files
|
||||
///
|
||||
fn import_timeline(&self, snapshot_lsn: Lsn) -> Result<()>;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub rec: Bytes,
|
||||
// Remember the offset of main_data in rec,
|
||||
@@ -177,27 +191,57 @@ pub struct WALRecord {
|
||||
|
||||
impl WALRecord {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u64(self.lsn.0);
|
||||
buf.put_u8(self.will_init as u8);
|
||||
buf.put_u32(self.main_data_offset);
|
||||
buf.put_u32(self.rec.len() as u32);
|
||||
buf.put_slice(&self.rec[..]);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> WALRecord {
|
||||
let lsn = Lsn::from(buf.get_u64());
|
||||
let will_init = buf.get_u8() != 0;
|
||||
let main_data_offset = buf.get_u32();
|
||||
let mut dst = vec![0u8; buf.get_u32() as usize];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
let rec_len = buf.get_u32() as usize;
|
||||
let rec = buf.split_to(rec_len);
|
||||
WALRecord {
|
||||
lsn,
|
||||
will_init,
|
||||
rec: Bytes::from(dst),
|
||||
rec,
|
||||
main_data_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum PageVersion {
|
||||
/// an 8kb page image
|
||||
Page(Bytes),
|
||||
/// WAL record to get from previous page version to this one.
|
||||
Wal(WALRecord),
|
||||
}
|
||||
|
||||
///
|
||||
/// Data needed to reconstruct a page version
|
||||
///
|
||||
/// 'page_img' is the old base image of the page to start the WAL replay with.
|
||||
/// It can be None, if the first WAL record initializes the page (will_init)
|
||||
/// 'records' contains the records to apply over the base image.
|
||||
///
|
||||
pub struct PageReconstructData {
|
||||
pub records: Vec<(Lsn, WALRecord)>,
|
||||
pub page_img: Option<Bytes>,
|
||||
}
|
||||
|
||||
/// Return value from Layer::get_page_reconstruct_data
|
||||
pub enum PageReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue(Lsn),
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing(Lsn),
|
||||
}
|
||||
|
||||
///
|
||||
/// Tests that should work the same with any Repository/Timeline implementation.
|
||||
///
|
||||
@@ -205,17 +249,21 @@ impl WALRecord {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::buffered_repository::{BufferedRepository, METADATA_FILE_NAME};
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use hex_literal::hex;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
@@ -240,60 +288,75 @@ mod tests {
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
|
||||
let incremental = timeline.get_current_logical_size();
|
||||
let non_incremental = timeline
|
||||
.get_current_logical_size_non_incremental(lsn)
|
||||
.unwrap();
|
||||
assert_eq!(incremental, non_incremental);
|
||||
}
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
|
||||
|
||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||
struct RepoHarness {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
}
|
||||
|
||||
let conf = PageServerConf {
|
||||
daemonize: false,
|
||||
gc_horizon: 64 * 1024 * 1024,
|
||||
gc_period: Duration::from_secs(10),
|
||||
listen_addr: "127.0.0.1:5430".to_string(),
|
||||
http_endpoint_addr: "127.0.0.1:9898".to_string(),
|
||||
superuser: "zenith_admin".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: "".into(),
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
};
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
let tenantid = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
|
||||
impl RepoHarness {
|
||||
fn create(test_name: &'static str) -> Result<Self> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let repo = Box::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
|
||||
Ok(repo)
|
||||
Ok(Self { conf, tenant_id })
|
||||
}
|
||||
|
||||
fn load(&self) -> Box<dyn Repository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
Box::new(BufferedRepository::new(
|
||||
self.conf,
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
false,
|
||||
))
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
let repo = RepoHarness::create("test_relsize")?.load();
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0x00))?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
tline.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
|
||||
tline.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
|
||||
writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
|
||||
|
||||
tline.advance_last_record_lsn(Lsn(0x50));
|
||||
writer.advance_last_record_lsn(Lsn(0x50));
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(0x50));
|
||||
|
||||
// The relation was created at LSN 2, not visible at LSN 1 yet.
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
|
||||
@@ -337,8 +400,9 @@ mod tests {
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x60));
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x60));
|
||||
assert_current_logical_size(&tline, Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 2);
|
||||
@@ -359,13 +423,13 @@ mod tests {
|
||||
);
|
||||
|
||||
// Truncate to zero length
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x60), 0)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x60));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 0);
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x68));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0);
|
||||
|
||||
// Extend from 0 to 2 blocks, leaving a gap
|
||||
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x70));
|
||||
writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x70));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
|
||||
assert_eq!(
|
||||
@@ -392,21 +456,161 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Test what happens if we dropped a relation
|
||||
// and then created it again within the same layer.
|
||||
#[test]
|
||||
fn test_drop_extend() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_drop_extend")?.load();
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x20));
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1);
|
||||
|
||||
// Drop relish
|
||||
writer.drop_relish(TESTREL_A, Lsn(0x30))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x30));
|
||||
|
||||
// Check that rel is not visible anymore
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
|
||||
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none());
|
||||
|
||||
// Extend it again
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x40))?.unwrap(), 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Test what happens if we truncated a relation
|
||||
// so that one of its segments was dropped
|
||||
// and then extended it again within the same layer.
|
||||
#[test]
|
||||
fn test_truncate_extend() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_truncate_extend")?.load();
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
//from storage_layer.rs
|
||||
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
|
||||
let relsize = RELISH_SEG_SIZE * 2;
|
||||
|
||||
// Create relation with relsize blocks
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
}
|
||||
|
||||
writer.advance_last_record_lsn(Lsn(0x20));
|
||||
|
||||
// The relation was created at LSN 2, not visible at LSN 1 yet.
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
|
||||
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none());
|
||||
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(),
|
||||
relsize
|
||||
);
|
||||
|
||||
// Check relation content
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TESTREL_A, blkno, lsn)?,
|
||||
TEST_IMG(&data)
|
||||
);
|
||||
}
|
||||
|
||||
// Truncate relation so that second segment was dropped
|
||||
// - only leave one page
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1);
|
||||
|
||||
for blkno in 0..1 {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?,
|
||||
TEST_IMG(&data)
|
||||
);
|
||||
}
|
||||
|
||||
// should still see all blocks with older LSN
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(),
|
||||
relsize
|
||||
);
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?,
|
||||
TEST_IMG(&data)
|
||||
);
|
||||
}
|
||||
|
||||
// Extend relation again.
|
||||
// Add enough blocks to create second segment
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x80);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
}
|
||||
writer.advance_last_record_lsn(Lsn(0x80));
|
||||
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(),
|
||||
relsize
|
||||
);
|
||||
// Check relation content
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x80);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?,
|
||||
TEST_IMG(&data)
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0x00))?;
|
||||
let repo = RepoHarness::create("test_large_rel")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
let mut lsn = 0x10;
|
||||
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
lsn += 0x10;
|
||||
tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
|
||||
writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
|
||||
}
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
@@ -415,29 +619,31 @@ mod tests {
|
||||
|
||||
// Truncate one block
|
||||
lsn += 0x10;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
pg_constants::RELSEG_SIZE
|
||||
);
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
|
||||
// Truncate another block
|
||||
lsn += 0x10;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
pg_constants::RELSEG_SIZE - 1
|
||||
);
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
|
||||
// Truncate to 1500, and then truncate all the way down to 0, one block at a time
|
||||
// This tests the behavior at segment boundaries
|
||||
let mut size: i32 = 3000;
|
||||
while size >= 0 {
|
||||
lsn += 0x10;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
size as u32
|
||||
@@ -445,6 +651,65 @@ mod tests {
|
||||
|
||||
size -= 1;
|
||||
}
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Test list_rels() function, with branches and dropped relations
|
||||
///
|
||||
#[test]
|
||||
fn test_list_rels_drop() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_list_rels_drop")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
const TESTDB: u32 = 111;
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
|
||||
writer.advance_last_record_lsn(Lsn(0x30));
|
||||
|
||||
// Check that list_rels() lists it after LSN 2, but no before it
|
||||
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A));
|
||||
assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A));
|
||||
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));
|
||||
|
||||
// Create a branch, check that the relation is visible there
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let new_writer = newtline.writer();
|
||||
|
||||
assert!(newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x30))?
|
||||
.contains(&TESTREL_A));
|
||||
|
||||
// Drop it on the branch
|
||||
new_writer.drop_relish(TESTREL_A, Lsn(0x40))?;
|
||||
new_writer.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
drop(new_writer);
|
||||
|
||||
// Check that it's no longer listed on the branch after the point where it was dropped
|
||||
assert!(newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x30))?
|
||||
.contains(&TESTREL_A));
|
||||
assert!(!newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x40))?
|
||||
.contains(&TESTREL_A));
|
||||
|
||||
// Run checkpoint and garbage collection and check that it's still not visible
|
||||
newtline.checkpoint()?;
|
||||
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
|
||||
|
||||
assert!(!newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x40))?
|
||||
.contains(&TESTREL_A));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -454,31 +719,32 @@ mod tests {
|
||||
///
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0x00))?;
|
||||
let repo = RepoHarness::create("test_branch")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_PAGE.clone())?;
|
||||
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
|
||||
// Create another relation
|
||||
tline.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
|
||||
|
||||
tline.advance_last_record_lsn(Lsn(0x40));
|
||||
writer.advance_last_record_lsn(Lsn(0x40));
|
||||
assert_current_logical_size(&tline, Lsn(0x40));
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let new_writer = newtline.writer();
|
||||
|
||||
newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
newtline.advance_last_record_lsn(Lsn(0x40));
|
||||
new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
new_writer.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
// Check page contents on both branches
|
||||
assert_eq!(
|
||||
@@ -498,21 +764,103 @@ mod tests {
|
||||
|
||||
assert_eq!(newtline.get_relish_size(TESTREL_B, Lsn(0x40))?.unwrap(), 1);
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(0x40));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
drop(repo);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
|
||||
assert!(metadata_path.is_file());
|
||||
|
||||
let mut metadata_bytes = std::fs::read(&metadata_path)?;
|
||||
assert_eq!(metadata_bytes.len(), 512);
|
||||
metadata_bytes[512 - 4 - 2] ^= 1;
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
|
||||
assert!(err.to_string().contains("checksum"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn future_layerfiles() -> Result<()> {
|
||||
const TEST_NAME: &str = "future_layerfiles";
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
drop(repo);
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let make_empty_file = |filename: &str| -> std::io::Result<()> {
|
||||
let path = timeline_path.join(filename);
|
||||
|
||||
assert!(!path.exists());
|
||||
std::fs::write(&path, &[])?;
|
||||
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let image_filename = format!("pg_control_0_{:016X}", 8000);
|
||||
let delta_filename = format!("pg_control_0_{:016X}_{:016X}", 8000, 8008);
|
||||
|
||||
make_empty_file(&image_filename)?;
|
||||
make_empty_file(&delta_filename)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
let check_old = |filename: &str, num: u32| {
|
||||
let path = timeline_path.join(filename);
|
||||
assert!(!path.exists());
|
||||
|
||||
let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
|
||||
assert!(backup_path.exists());
|
||||
};
|
||||
|
||||
check_old(&image_filename, 0);
|
||||
check_old(&delta_filename, 0);
|
||||
|
||||
make_empty_file(&image_filename)?;
|
||||
make_empty_file(&delta_filename)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
check_old(&image_filename, 0);
|
||||
check_old(&delta_filename, 0);
|
||||
check_old(&image_filename, 1);
|
||||
check_old(&delta_filename, 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager {}
|
||||
struct TestRedoManager;
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} blk {} to get to {}, with {} and {} records",
|
||||
|
||||
@@ -2,19 +2,17 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! zenith Timeline.
|
||||
//!
|
||||
use log::*;
|
||||
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
|
||||
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
|
||||
use std::cmp::min;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::io::Seek;
|
||||
use std::io::SeekFrom;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use tracing::*;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
@@ -36,9 +34,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
///
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
writer: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
// Scan 'global'
|
||||
for direntry in fs::read_dir(path.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
@@ -46,16 +46,10 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
None => continue,
|
||||
|
||||
Some("pg_control") => {
|
||||
import_nonrel_file(timeline, lsn, RelishTag::ControlFile, &direntry.path())?;
|
||||
// Extract checkpoint record from pg_control and store is as separate object
|
||||
let pg_control_bytes =
|
||||
timeline.get_page_at_lsn_nowait(RelishTag::ControlFile, 0, lsn)?;
|
||||
let pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
|
||||
pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
|
||||
}
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
writer,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
@@ -67,7 +61,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
writer,
|
||||
lsn,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
@@ -94,7 +88,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
writer,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
@@ -106,7 +100,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
writer,
|
||||
lsn,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
@@ -116,25 +110,36 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?;
|
||||
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
|
||||
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
|
||||
timeline.advance_last_record_lsn(lsn.align());
|
||||
timeline.checkpoint()?;
|
||||
// Import WAL. This is needed even when starting from a shutdown checkpoint, because
|
||||
// this reads the checkpoint record itself, advancing the tip of the timeline to
|
||||
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'
|
||||
let pg_control = pg_control.ok_or_else(|| anyhow!("pg_control file not found"))?;
|
||||
import_wal(
|
||||
&path.join("pg_wal"),
|
||||
writer,
|
||||
Lsn(pg_control.checkPointCopy.redo),
|
||||
lsn,
|
||||
&mut pg_control.checkPointCopy.clone(),
|
||||
)?;
|
||||
writer.checkpoint()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -142,12 +147,13 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_relfile(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
) -> Result<()> {
|
||||
// Does it look like a relation file?
|
||||
trace!("importing rel file {}", path.display());
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
if let Err(e) = p {
|
||||
@@ -175,15 +181,14 @@ fn import_relfile(
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
Err(err) => match err.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
bail!("error reading file {}: {:#}", path.display(), err);
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -200,7 +205,7 @@ fn import_relfile(
|
||||
/// are just slurped into the repository as one blob.
|
||||
///
|
||||
fn import_nonrel_file(
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
tag: RelishTag,
|
||||
path: &Path,
|
||||
@@ -210,22 +215,60 @@ fn import_nonrel_file(
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
info!("importing non-rel file {}", path.display());
|
||||
trace!("importing non-rel file {}", path.display());
|
||||
|
||||
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Import pg_control file into the repository.
|
||||
///
|
||||
/// The control file is imported as is, but we also extract the checkpoint record
|
||||
/// from it and store it separated.
|
||||
fn import_control_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
path: &Path,
|
||||
) -> Result<ControlFileData> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
trace!("importing control file {}", path.display());
|
||||
|
||||
// Import it as ControlFile
|
||||
timeline.put_page_image(
|
||||
RelishTag::ControlFile,
|
||||
0,
|
||||
lsn,
|
||||
Bytes::copy_from_slice(&buffer[..]),
|
||||
)?;
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&buffer)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
|
||||
|
||||
Ok(pg_control)
|
||||
}
|
||||
|
||||
///
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> {
|
||||
fn import_slru_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
slru: SlruKind,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
// Does it look like an SLRU file?
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
info!("importing slru file {}", path.display());
|
||||
trace!("importing slru file {}", path.display());
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
@@ -241,15 +284,14 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
Err(err) => match err.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
bail!("error reading file {}: {:#}", path.display(), err);
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -261,24 +303,27 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory
|
||||
/// and load all records >= 'startpoint' into the repository.
|
||||
pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint: Lsn) -> Result<()> {
|
||||
/// Scan PostgreSQL WAL files in given directory and load all records between
|
||||
/// 'startpoint' and 'endpoint' into the repository.
|
||||
fn import_wal(
|
||||
walpath: &Path,
|
||||
timeline: &dyn TimelineWriter,
|
||||
startpoint: Lsn,
|
||||
endpoint: Lsn,
|
||||
checkpoint: &mut CheckPoint,
|
||||
) -> Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(RelishTag::Checkpoint, 0, startpoint)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
loop {
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
//Read local file
|
||||
// Read local file
|
||||
let mut path = walpath.join(&filename);
|
||||
|
||||
// It could be as .partial
|
||||
@@ -287,13 +332,7 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let open_result = File::open(&path);
|
||||
if let Err(e) = &open_result {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let mut file = open_result?;
|
||||
let mut file = File::open(&path)?;
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
@@ -308,36 +347,57 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
loop {
|
||||
let rec = waldecoder.poll_decode();
|
||||
if rec.is_err() {
|
||||
// Assume that an error means we've reached the end of
|
||||
// a partial WAL record. So that's ok.
|
||||
trace!("WAL decoder error {:?}", rec);
|
||||
break;
|
||||
}
|
||||
if let Some((lsn, recdata)) = rec.unwrap() {
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let mut checkpoint_modified = false;
|
||||
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
save_decoded_record(&mut checkpoint, timeline, &decoded, recdata, lsn)?;
|
||||
save_decoded_record(
|
||||
checkpoint,
|
||||
&mut checkpoint_modified,
|
||||
timeline,
|
||||
&decoded,
|
||||
recdata,
|
||||
lsn,
|
||||
)?;
|
||||
last_lsn = lsn;
|
||||
} else {
|
||||
break;
|
||||
|
||||
if checkpoint_modified {
|
||||
let checkpoint_bytes = checkpoint.encode();
|
||||
timeline.put_page_image(
|
||||
RelishTag::Checkpoint,
|
||||
0,
|
||||
last_lsn,
|
||||
checkpoint_bytes,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(last_lsn);
|
||||
nrecords += 1;
|
||||
|
||||
trace!("imported record at {} (end {})", lsn, endpoint);
|
||||
}
|
||||
nrecords += 1;
|
||||
}
|
||||
|
||||
info!("imported {} records up to {}", nrecords, last_lsn);
|
||||
debug!("imported {} records up to {}", nrecords, last_lsn);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
let checkpoint_bytes = checkpoint.encode();
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, last_lsn, checkpoint_bytes)?;
|
||||
if last_lsn != startpoint {
|
||||
debug!(
|
||||
"reached end of WAL at {}, updating checkpoint info",
|
||||
last_lsn
|
||||
);
|
||||
|
||||
timeline.advance_last_record_lsn(last_lsn);
|
||||
} else {
|
||||
info!("no WAL to import at {}", last_lsn);
|
||||
}
|
||||
|
||||
timeline.advance_last_record_lsn(last_lsn.align());
|
||||
timeline.checkpoint()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -347,13 +407,15 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
|
||||
///
|
||||
pub fn save_decoded_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
decoded: &DecodedWALRecord,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
checkpoint.update_next_xid(decoded.xl_xid);
|
||||
|
||||
if checkpoint.update_next_xid(decoded.xl_xid) {
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
@@ -363,15 +425,38 @@ pub fn save_decoded_record(
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
});
|
||||
if blk.apply_image
|
||||
&& blk.has_image
|
||||
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
&& (decoded.xl_info == pg_constants::XLOG_FPI
|
||||
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
|
||||
{
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize);
|
||||
image.extend_from_slice(&recdata[img_offs..img_offs + img_len]);
|
||||
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
// Compression of WAL is not yet supported
|
||||
assert!((blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0);
|
||||
|
||||
timeline.put_wal_record(tag, blk.blkno, rec)?;
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
|
||||
image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
|
||||
assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
|
||||
timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?;
|
||||
} else {
|
||||
let rec = WALRecord {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
@@ -392,10 +477,14 @@ pub fn save_decoded_record(
|
||||
{
|
||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||
|
||||
// To drop the database, we need to drop all the relations in it. Like in
|
||||
// save_xlog_dbase_create(), use the previous record's LSN in the list_rels() call
|
||||
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
|
||||
|
||||
for tablespace_id in dropdb.tablespace_ids {
|
||||
let rels = timeline.list_rels(tablespace_id, dropdb.db_id, lsn)?;
|
||||
let rels = timeline.list_rels(tablespace_id, dropdb.db_id, req_lsn)?;
|
||||
for rel in rels {
|
||||
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
|
||||
timeline.drop_relish(rel, lsn)?;
|
||||
}
|
||||
trace!(
|
||||
"Drop FileNodeMap {}, {} at lsn {}",
|
||||
@@ -432,7 +521,7 @@ pub fn save_decoded_record(
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
let xlrec = XlClogTruncate::decode(&mut buf);
|
||||
save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
save_clog_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
@@ -501,10 +590,17 @@ pub fn save_decoded_record(
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
|
||||
save_multixact_create_record(
|
||||
checkpoint,
|
||||
checkpoint_modified,
|
||||
timeline,
|
||||
lsn,
|
||||
&xlrec,
|
||||
decoded,
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
||||
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
save_multixact_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
@@ -513,7 +609,10 @@ pub fn save_decoded_record(
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
checkpoint.nextOid = next_oid;
|
||||
if checkpoint.nextOid != next_oid {
|
||||
checkpoint.nextOid = next_oid;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
@@ -529,17 +628,19 @@ pub fn save_decoded_record(
|
||||
);
|
||||
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
|
||||
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Now that this record has been handled, let the repository know that
|
||||
// it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(lsn.align());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
|
||||
fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
|
||||
fn save_xlog_dbase_create(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
rec: &XlCreateDatabase,
|
||||
) -> Result<()> {
|
||||
let db_id = rec.db_id;
|
||||
let tablespace_id = rec.tablespace_id;
|
||||
let src_db_id = rec.src_db_id;
|
||||
@@ -558,37 +659,36 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
|
||||
let mut num_rels_copied = 0;
|
||||
let mut num_blocks_copied = 0;
|
||||
for src_rel in rels {
|
||||
assert_eq!(src_rel.spcnode, src_tablespace_id);
|
||||
assert_eq!(src_rel.dbnode, src_db_id);
|
||||
for rel in rels {
|
||||
if let RelishTag::Relation(src_rel) = rel {
|
||||
assert_eq!(src_rel.spcnode, src_tablespace_id);
|
||||
assert_eq!(src_rel.dbnode, src_db_id);
|
||||
|
||||
let nblocks = timeline
|
||||
.get_relish_size(RelishTag::Relation(src_rel), req_lsn)?
|
||||
.unwrap_or(0);
|
||||
let dst_rel = RelTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
relnode: src_rel.relnode,
|
||||
forknum: src_rel.forknum,
|
||||
};
|
||||
let nblocks = timeline.get_relish_size(rel, req_lsn)?.unwrap_or(0);
|
||||
let dst_rel = RelTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
relnode: src_rel.relnode,
|
||||
forknum: src_rel.forknum,
|
||||
};
|
||||
|
||||
// Copy content
|
||||
for blknum in 0..nblocks {
|
||||
let content =
|
||||
timeline.get_page_at_lsn_nowait(RelishTag::Relation(src_rel), blknum, req_lsn)?;
|
||||
// Copy content
|
||||
for blknum in 0..nblocks {
|
||||
let content = timeline.get_page_at_lsn(rel, blknum, req_lsn)?;
|
||||
|
||||
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
|
||||
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
|
||||
|
||||
timeline.put_page_image(RelishTag::Relation(dst_rel), blknum, lsn, content)?;
|
||||
num_blocks_copied += 1;
|
||||
timeline.put_page_image(RelishTag::Relation(dst_rel), blknum, lsn, content)?;
|
||||
num_blocks_copied += 1;
|
||||
}
|
||||
|
||||
if nblocks == 0 {
|
||||
// make sure we have some trace of the relation, even if it's empty
|
||||
timeline.put_truncation(RelishTag::Relation(dst_rel), lsn, 0)?;
|
||||
}
|
||||
|
||||
num_rels_copied += 1;
|
||||
}
|
||||
|
||||
if nblocks == 0 {
|
||||
// make sure we have some trace of the relation, even if it's empty
|
||||
timeline.put_truncation(RelishTag::Relation(dst_rel), lsn, 0)?;
|
||||
}
|
||||
|
||||
num_rels_copied += 1;
|
||||
}
|
||||
|
||||
// Copy relfilemap
|
||||
@@ -597,7 +697,7 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
for tag in timeline.list_nonrels(req_lsn)? {
|
||||
if let RelishTag::FileNodeMap { spcnode, dbnode } = tag {
|
||||
if spcnode == src_tablespace_id && dbnode == src_db_id {
|
||||
let img = timeline.get_page_at_lsn_nowait(tag, 0, req_lsn)?;
|
||||
let img = timeline.get_page_at_lsn(tag, 0, req_lsn)?;
|
||||
let new_tag = RelishTag::FileNodeMap {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
@@ -617,7 +717,11 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
|
||||
///
|
||||
/// This is the same logic as in PostgreSQL's smgr_redo() function.
|
||||
fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
|
||||
fn save_xlog_smgr_truncate(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
rec: &XlSmgrTruncate,
|
||||
) -> Result<()> {
|
||||
let spcnode = rec.rnode.spcnode;
|
||||
let dbnode = rec.rnode.dbnode;
|
||||
let relnode = rec.rnode.relnode;
|
||||
@@ -679,7 +783,7 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records.
|
||||
///
|
||||
fn save_xact_record(
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
parsed: &XlXactParsedRecord,
|
||||
decoded: &DecodedWALRecord,
|
||||
@@ -690,12 +794,12 @@ fn save_xact_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
@@ -711,6 +815,7 @@ fn save_xact_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
@@ -736,7 +841,8 @@ fn save_xact_record(
|
||||
|
||||
fn save_clog_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlClogTruncate,
|
||||
) -> Result<()> {
|
||||
@@ -754,6 +860,7 @@ fn save_clog_truncate_record(
|
||||
// TODO Figure out if there will be any issues with replica.
|
||||
checkpoint.oldestXid = xlrec.oldest_xid;
|
||||
checkpoint.oldestXidDB = xlrec.oldest_xid_db;
|
||||
*checkpoint_modified = true;
|
||||
|
||||
// TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it
|
||||
|
||||
@@ -796,13 +903,13 @@ fn save_clog_truncate_record(
|
||||
|
||||
fn save_multixact_create_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
@@ -811,6 +918,7 @@ fn save_multixact_create_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
@@ -830,6 +938,7 @@ fn save_multixact_create_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
@@ -852,9 +961,11 @@ fn save_multixact_create_record(
|
||||
}
|
||||
if xlrec.mid >= checkpoint.nextMulti {
|
||||
checkpoint.nextMulti = xlrec.mid + 1;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
|
||||
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
|
||||
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
|
||||
@@ -864,18 +975,22 @@ fn save_multixact_create_record(
|
||||
}
|
||||
});
|
||||
|
||||
checkpoint.update_next_xid(max_mbr_xid);
|
||||
if checkpoint.update_next_xid(max_mbr_xid) {
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
) -> Result<()> {
|
||||
checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
*checkpoint_modified = true;
|
||||
|
||||
// PerformMembersTruncation
|
||||
let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
|
||||
@@ -909,7 +1024,7 @@ fn save_multixact_truncate_record(
|
||||
}
|
||||
|
||||
fn save_relmap_page(
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlRelmapUpdate,
|
||||
decoded: &DecodedWALRecord,
|
||||
|
||||
@@ -2,73 +2,253 @@
|
||||
//! page server.
|
||||
|
||||
use crate::branches;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::repository::Repository;
|
||||
use crate::buffered_repository::BufferedRepository;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::info;
|
||||
use log::{debug, info};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::thread::JoinHandle;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
lazy_static! {
|
||||
pub static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
|
||||
static ref TENANTS: Mutex<HashMap<ZTenantId, Tenant>> = Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
struct Tenant {
|
||||
state: TenantState,
|
||||
repo: Option<Arc<dyn Repository>>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum TenantState {
|
||||
// This tenant only exists in cloud storage. It cannot be accessed.
|
||||
CloudOnly,
|
||||
// This tenant exists in cloud storage, and we are currently downloading it to local disk.
|
||||
// It cannot be accessed yet, not until it's been fully downloaded to local disk.
|
||||
Downloading,
|
||||
// All data for this tenant is complete on local disk, but we haven't loaded the Repository,
|
||||
// Timeline and Layer structs into memory yet, so it cannot be accessed yet.
|
||||
//Ready,
|
||||
// This tenant exists on local disk, and the layer map has been loaded into memory.
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
Active,
|
||||
// This tenant exists on local disk, and the layer map has been loaded into memory.
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
// The tenant cannot be accessed anymore for any reason, but graceful shutdown.
|
||||
//Stopping,
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantState {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
TenantState::CloudOnly => f.write_str("CloudOnly"),
|
||||
TenantState::Downloading => f.write_str("Downloading"),
|
||||
TenantState::Active => f.write_str("Active"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
TENANTS.lock().unwrap()
|
||||
}
|
||||
|
||||
struct TenantHandleEntry {
|
||||
checkpointer_handle: Option<JoinHandle<()>>,
|
||||
uploader_handle: Option<JoinHandle<()>>,
|
||||
gc_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
// Logically these handles belong to Repository,
|
||||
// but it's just simpler to store them separately
|
||||
lazy_static! {
|
||||
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
||||
let tenantid =
|
||||
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
||||
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
let tenant = Tenant {
|
||||
state: TenantState::CloudOnly,
|
||||
repo: None,
|
||||
};
|
||||
m.insert(tenantid, tenant);
|
||||
}
|
||||
|
||||
init_repo(conf, tenantid);
|
||||
info!("initialized storage for tenant: {}", &tenantid);
|
||||
m.insert(tenantid, repo);
|
||||
}
|
||||
}
|
||||
|
||||
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(BufferedRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
true,
|
||||
));
|
||||
|
||||
let checkpointer_handle = BufferedRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
let gc_handle = BufferedRepository::launch_gc_thread(conf, repo.clone());
|
||||
let uploader_handle = BufferedRepository::launch_upload_thread(conf, repo.clone());
|
||||
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
let h = TenantHandleEntry {
|
||||
checkpointer_handle: Some(checkpointer_handle),
|
||||
gc_handle: Some(gc_handle),
|
||||
uploader_handle: Some(uploader_handle),
|
||||
};
|
||||
|
||||
handles.insert(tenant_id, h);
|
||||
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenant_id).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Active;
|
||||
}
|
||||
|
||||
// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
|
||||
// Relevant PR: https://github.com/zenithdb/zenith/pull/686
|
||||
pub fn register_relish_download(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) {
|
||||
log::info!(
|
||||
"Registering new download, tenant id {}, timeline id: {}",
|
||||
tenant_id,
|
||||
timeline_id
|
||||
);
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
let mut tenant = m.get_mut(&tenant_id).unwrap();
|
||||
tenant.state = TenantState::Downloading;
|
||||
match &tenant.repo {
|
||||
Some(repo) => init_timeline(repo.as_ref(), timeline_id),
|
||||
None => {
|
||||
log::info!("Initialize new repo");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// init repo updates Tenant state
|
||||
init_repo(conf, tenant_id);
|
||||
let new_repo = get_repository_for_tenant(tenant_id).unwrap();
|
||||
init_timeline(new_repo.as_ref(), timeline_id);
|
||||
}
|
||||
|
||||
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
|
||||
match repo.get_timeline(timeline_id) {
|
||||
Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
|
||||
Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
|
||||
}
|
||||
}
|
||||
|
||||
// Check this flag in the thread loops to know when to exit
|
||||
pub fn shutdown_requested() -> bool {
|
||||
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn stop_tenant_threads(tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
if let Some(h) = handles.get_mut(&tenantid) {
|
||||
h.checkpointer_handle.take().map(JoinHandle::join);
|
||||
debug!("checkpointer for tenant {} has stopped", tenantid);
|
||||
h.uploader_handle.take().map(JoinHandle::join);
|
||||
debug!("uploader for tenant {} has stopped", tenantid);
|
||||
h.gc_handle.take().map(JoinHandle::join);
|
||||
debug!("gc for tenant {} has stopped", tenantid);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shutdown_all_tenants() -> Result<()> {
|
||||
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
|
||||
|
||||
let tenantids = list_tenantids()?;
|
||||
for tenantid in tenantids {
|
||||
stop_tenant_threads(tenantid);
|
||||
let repo = get_repository_for_tenant(tenantid)?;
|
||||
debug!("shutdown tenant {}", tenantid);
|
||||
repo.shutdown()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn create_repository_for_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<()> {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
bail!("tenant {} already exists", tenantid);
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
bail!("tenant {} already exists", tenantid);
|
||||
}
|
||||
let tenant = Tenant {
|
||||
state: TenantState::CloudOnly,
|
||||
repo: None,
|
||||
};
|
||||
m.insert(tenantid, tenant);
|
||||
}
|
||||
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
|
||||
|
||||
m.insert(tenantid, repo);
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Active;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
|
||||
let o = &mut REPOSITORY.lock().unwrap();
|
||||
o.insert(tenantid, repo);
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
let m = access_tenants();
|
||||
let tenant = m
|
||||
.get(&tenantid)
|
||||
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid));
|
||||
|
||||
match &tenant.unwrap().repo {
|
||||
Some(repo) => Ok(Arc::clone(repo)),
|
||||
None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenantid: &ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
let o = &REPOSITORY.lock().unwrap();
|
||||
o.get(tenantid)
|
||||
.map(|repo| Arc::clone(repo))
|
||||
.ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid))
|
||||
pub fn get_timeline_for_tenant(
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<Arc<dyn Timeline>> {
|
||||
get_repository_for_tenant(tenantid)?
|
||||
.get_timeline(timelineid)
|
||||
.with_context(|| format!("cannot fetch timeline {}", timelineid))
|
||||
}
|
||||
|
||||
fn list_tenantids() -> Result<Vec<ZTenantId>> {
|
||||
let m = access_tenants();
|
||||
m.iter()
|
||||
.map(|v| {
|
||||
let (tenantid, _) = v;
|
||||
Ok(*tenantid)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
258
pageserver/src/toast_store.rs
Normal file
258
pageserver/src/toast_store.rs
Normal file
@@ -0,0 +1,258 @@
|
||||
use anyhow::{anyhow, Result};
|
||||
use lz4_flex;
|
||||
use std::convert::TryInto;
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
use std::path::Path;
|
||||
|
||||
use yakv::storage::{Key, Storage, StorageConfig, StorageIterator, Transaction, Value};
|
||||
|
||||
const TOAST_SEGMENT_SIZE: usize = 2 * 1024;
|
||||
const CACHE_SIZE: usize = 32 * 1024; // 256Mb
|
||||
//const CACHE_SIZE: usize = 128 * 1024; // 1Gb
|
||||
|
||||
///
|
||||
/// Toast storage consistof two KV databases: one for storing main index
|
||||
/// and second for storing sliced BLOB (values larger than 2kb).
|
||||
/// BLOBs and main data are stored in different databases to improve
|
||||
/// data locality and reduce key size for TOAST segments.
|
||||
///
|
||||
pub struct ToastStore {
|
||||
db: Storage, // key-value database
|
||||
}
|
||||
|
||||
pub struct ToastIterator<'a> {
|
||||
iter: StorageIterator<'a>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct PageData {
|
||||
data: [u8; 8192],
|
||||
}
|
||||
|
||||
impl PageData {
|
||||
pub fn find_first_zero_bit(&self, offs: usize) -> usize {
|
||||
let bytes = self.data;
|
||||
for i in offs..8192 {
|
||||
if bytes[i] != 0xFFu8 {
|
||||
return i * 8 + bytes[i].trailing_ones() as usize;
|
||||
}
|
||||
}
|
||||
usize::MAX
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ToastIterator<'a> {
|
||||
type Item = Result<(Key, Value)>;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut toast: Option<Vec<u8>> = None;
|
||||
let mut next_segno = 0u16;
|
||||
for elem in &mut self.iter {
|
||||
if let Ok((key, value)) = elem {
|
||||
let key_len = key.len();
|
||||
let n_segments =
|
||||
u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
|
||||
let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
|
||||
let key = key[..key_len - 4].to_vec();
|
||||
if n_segments != 0 {
|
||||
// TOAST
|
||||
assert_eq!(segno, next_segno);
|
||||
if next_segno == 0 {
|
||||
toast = Some(Vec::with_capacity(n_segments as usize * TOAST_SEGMENT_SIZE))
|
||||
}
|
||||
toast.as_mut().unwrap().extend_from_slice(&value);
|
||||
next_segno = segno + 1;
|
||||
if next_segno == n_segments {
|
||||
let res = lz4_flex::decompress_size_prepended(&toast.unwrap());
|
||||
return Some(if let Ok(decompressed_data) = res {
|
||||
Ok((key, decompressed_data))
|
||||
} else {
|
||||
Err(anyhow!(res.unwrap_err()))
|
||||
});
|
||||
}
|
||||
} else {
|
||||
return Some(Ok((key, value)));
|
||||
}
|
||||
} else {
|
||||
return Some(elem);
|
||||
}
|
||||
}
|
||||
assert_eq!(next_segno, 0);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for ToastIterator<'a> {
|
||||
fn next_back(&mut self) -> Option<Self::Item> {
|
||||
let mut toast: Option<Vec<u8>> = None;
|
||||
let mut next_segno = 0u16;
|
||||
while let Some(elem) = self.iter.next_back() {
|
||||
if let Ok((key, value)) = elem {
|
||||
assert!(!value.is_empty());
|
||||
let key_len = key.len();
|
||||
let n_segments =
|
||||
u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
|
||||
let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
|
||||
let key = key[..key_len - 4].to_vec();
|
||||
if n_segments != 0 {
|
||||
// TOAST
|
||||
assert!(segno + 1 == next_segno || next_segno == 0);
|
||||
if next_segno == 0 {
|
||||
let len = (n_segments - 1) as usize * TOAST_SEGMENT_SIZE + value.len();
|
||||
let mut vec = vec![0u8; len];
|
||||
vec[len - value.len()..].copy_from_slice(&value);
|
||||
toast = Some(vec);
|
||||
} else {
|
||||
toast.as_mut().unwrap()[segno as usize * TOAST_SEGMENT_SIZE
|
||||
..(segno + 1) as usize * TOAST_SEGMENT_SIZE]
|
||||
.copy_from_slice(&value);
|
||||
}
|
||||
next_segno = segno;
|
||||
if next_segno == 0 {
|
||||
let toast = toast.unwrap();
|
||||
assert!(!toast.is_empty());
|
||||
let res = lz4_flex::decompress_size_prepended(&toast);
|
||||
return Some(if let Ok(decompressed_data) = res {
|
||||
Ok((key, decompressed_data))
|
||||
} else {
|
||||
Err(anyhow!(res.unwrap_err()))
|
||||
});
|
||||
}
|
||||
} else {
|
||||
return Some(Ok((key, value)));
|
||||
}
|
||||
} else {
|
||||
return Some(elem);
|
||||
}
|
||||
}
|
||||
assert_eq!(next_segno, 0);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// FIXME-KK: not using WAL now. Implement asynchronous or delayed commit.
|
||||
//
|
||||
impl ToastStore {
|
||||
pub fn new(path: &Path) -> Result<ToastStore> {
|
||||
Ok(ToastStore {
|
||||
db: Storage::open(
|
||||
&path.join("pageserver.db"),
|
||||
StorageConfig {
|
||||
cache_size: CACHE_SIZE,
|
||||
nosync: false,
|
||||
mursiw: true,
|
||||
},
|
||||
)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn put(&self, key: Key, value: Value) -> Result<()> {
|
||||
let mut tx = self.db.start_transaction();
|
||||
self.tx_remove(&mut tx, &key)?;
|
||||
let value_len = value.len();
|
||||
let mut key = key;
|
||||
if value_len >= TOAST_SEGMENT_SIZE {
|
||||
let compressed_data = lz4_flex::compress_prepend_size(&value);
|
||||
let compressed_data_len = compressed_data.len();
|
||||
let mut offs: usize = 0;
|
||||
let mut segno = 0u16;
|
||||
let n_segments =
|
||||
((compressed_data_len + TOAST_SEGMENT_SIZE - 1) / TOAST_SEGMENT_SIZE) as u16;
|
||||
assert!(n_segments != 0);
|
||||
key.extend_from_slice(&n_segments.to_be_bytes());
|
||||
key.extend_from_slice(&[0u8; 2]);
|
||||
let key_len = key.len();
|
||||
while offs + TOAST_SEGMENT_SIZE < compressed_data_len {
|
||||
key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
|
||||
tx.put(
|
||||
&key,
|
||||
&compressed_data[offs..offs + TOAST_SEGMENT_SIZE].to_vec(),
|
||||
)?;
|
||||
offs += TOAST_SEGMENT_SIZE;
|
||||
segno += 1;
|
||||
}
|
||||
key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
|
||||
tx.put(&key, &compressed_data[offs..].to_vec())?;
|
||||
} else {
|
||||
key.extend_from_slice(&[0u8; 4]);
|
||||
tx.put(&key, &value)?;
|
||||
}
|
||||
tx.delay()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn commit(&mut self) -> Result<()> {
|
||||
let tx = self.db.start_transaction();
|
||||
tx.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> ToastIterator<'_> {
|
||||
self.range(..)
|
||||
}
|
||||
|
||||
pub fn range<R: RangeBounds<Key>>(&self, range: R) -> ToastIterator<'_> {
|
||||
let from = match range.start_bound() {
|
||||
Bound::Included(key) => {
|
||||
let mut key = key.clone();
|
||||
key.extend_from_slice(&[0u8; 4]);
|
||||
Bound::Included(key)
|
||||
}
|
||||
Bound::Excluded(key) => {
|
||||
let mut key = key.clone();
|
||||
key.extend_from_slice(&[0u8; 4]);
|
||||
Bound::Excluded(key)
|
||||
}
|
||||
_ => Bound::Unbounded,
|
||||
};
|
||||
let till = match range.end_bound() {
|
||||
Bound::Included(key) => {
|
||||
let mut key = key.clone();
|
||||
key.extend_from_slice(&[0xFFu8; 4]);
|
||||
Bound::Included(key)
|
||||
}
|
||||
Bound::Excluded(key) => {
|
||||
let mut key = key.clone();
|
||||
key.extend_from_slice(&[0xFFu8; 4]);
|
||||
Bound::Excluded(key)
|
||||
}
|
||||
_ => Bound::Unbounded,
|
||||
};
|
||||
ToastIterator {
|
||||
iter: self.db.range((from, till)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&self, key: Key) -> Result<()> {
|
||||
let mut tx = self.db.start_transaction();
|
||||
self.tx_remove(&mut tx, &key)?;
|
||||
tx.delay()
|
||||
}
|
||||
|
||||
pub fn tx_remove(&self, tx: &mut Transaction, key: &[u8]) -> Result<()> {
|
||||
let mut min_key = key.to_vec();
|
||||
let mut max_key = key.to_vec();
|
||||
min_key.extend_from_slice(&[0u8; 4]);
|
||||
max_key.extend_from_slice(&[0xFFu8; 4]);
|
||||
let mut iter = tx.range(&min_key..&max_key);
|
||||
if let Some(entry) = iter.next() {
|
||||
let mut key = entry?.0;
|
||||
let key_len = key.len();
|
||||
let n_segments = u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
|
||||
if n_segments != 0 {
|
||||
// TOAST
|
||||
for i in 0..n_segments {
|
||||
key[key_len - 2..].copy_from_slice(&i.to_be_bytes());
|
||||
tx.remove(&key)?;
|
||||
}
|
||||
} else {
|
||||
tx.remove(&key)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.db.get_database_info().db_used
|
||||
}
|
||||
}
|
||||
@@ -54,6 +54,11 @@ impl WalStreamDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
// The latest LSN position fed to the decoder.
|
||||
pub fn available(&self) -> Lsn {
|
||||
self.lsn + self.inputbuf.remaining() as u64
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
@@ -67,6 +72,10 @@ impl WalStreamDecoder {
|
||||
/// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
let recordbuf;
|
||||
|
||||
// Run state machine that validates page headers, and reassembles records
|
||||
// that cross page boundaries.
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
@@ -115,29 +124,41 @@ impl WalStreamDecoder {
|
||||
self.lsn += self.padlen as u64;
|
||||
self.padlen = 0;
|
||||
} else if self.contlen == 0 {
|
||||
// need to have at least the xl_tot_len field
|
||||
assert!(self.recordbuf.is_empty());
|
||||
|
||||
// need to have at least the xl_tot_len field
|
||||
if self.inputbuf.remaining() < 4 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// read xl_tot_len FIXME: assumes little-endian
|
||||
// peek xl_tot_len at the beginning of the record.
|
||||
// FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = self.inputbuf.get_u32_le();
|
||||
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
|
||||
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
self.lsn += 4;
|
||||
|
||||
self.recordbuf.clear();
|
||||
self.recordbuf.reserve(xl_tot_len as usize);
|
||||
self.recordbuf.put_u32_le(xl_tot_len);
|
||||
|
||||
self.contlen = xl_tot_len - 4;
|
||||
continue;
|
||||
// Fast path for the common case that the whole record fits on the page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
|
||||
// Take the record from the 'inputbuf', and validate it.
|
||||
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
|
||||
self.lsn += xl_tot_len as u64;
|
||||
break;
|
||||
} else {
|
||||
// Need to assemble the record from pieces. Remember the size of the
|
||||
// record, and loop back. On next iteration, we will reach the 'else'
|
||||
// branch below, and copy the part of the record that was on this page
|
||||
// to 'recordbuf'. Subsequent iterations will skip page headers, and
|
||||
// append the continuations from the next pages to 'recordbuf'.
|
||||
self.recordbuf.reserve(xl_tot_len as usize);
|
||||
self.contlen = xl_tot_len;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// we're continuing a record, possibly from previous page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
@@ -154,45 +175,42 @@ impl WalStreamDecoder {
|
||||
self.contlen -= n as u32;
|
||||
|
||||
if self.contlen == 0 {
|
||||
let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new());
|
||||
|
||||
let recordbuf = recordbuf.freeze();
|
||||
let mut buf = recordbuf.clone();
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
let mut crc = crc32c_append(0, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
|
||||
if crc != xlogrec.xl_crc {
|
||||
return Err(WalDecodeError {
|
||||
msg: "WAL record crc mismatch".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen =
|
||||
self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
return Ok(Some(result));
|
||||
// The record is now complete.
|
||||
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// check record boundaries
|
||||
|
||||
// deal with continuation records
|
||||
// We now have a record in the 'recordbuf' local variable.
|
||||
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
|
||||
|
||||
// deal with xlog_switch records
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
|
||||
if crc != xlogrec.xl_crc {
|
||||
return Err(WalDecodeError {
|
||||
msg: "WAL record crc mismatch".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
Ok(Some(result))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,17 +229,18 @@ pub struct DecodedBkpBlock {
|
||||
pub blkno: u32,
|
||||
|
||||
/* copy of the fork_flags field from the XLogRecordBlockHeader */
|
||||
flags: u8,
|
||||
pub flags: u8,
|
||||
|
||||
/* Information on full-page image, if any */
|
||||
has_image: bool, /* has image, even for consistency checking */
|
||||
pub has_image: bool, /* has image, even for consistency checking */
|
||||
pub apply_image: bool, /* has image that should be restored */
|
||||
pub will_init: bool, /* record doesn't need previous page version to apply */
|
||||
//char *bkp_image;
|
||||
hole_offset: u16,
|
||||
hole_length: u16,
|
||||
bimg_len: u16,
|
||||
bimg_info: u8,
|
||||
pub hole_offset: u16,
|
||||
pub hole_length: u16,
|
||||
pub bimg_offset: u32,
|
||||
pub bimg_len: u16,
|
||||
pub bimg_info: u8,
|
||||
|
||||
/* Buffer holding the rmgr-specific data associated with this block */
|
||||
has_data: bool,
|
||||
@@ -841,8 +860,19 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
}
|
||||
|
||||
// 3. Decode blocks.
|
||||
let mut ptr = record.len() - buf.remaining();
|
||||
for blk in blocks.iter_mut() {
|
||||
if blk.has_image {
|
||||
blk.bimg_offset = ptr as u32;
|
||||
ptr += blk.bimg_len as usize;
|
||||
}
|
||||
if blk.has_data {
|
||||
ptr += blk.data_len as usize;
|
||||
}
|
||||
}
|
||||
// We don't need them, so just skip blocks_total_len bytes
|
||||
buf.advance(blocks_total_len as usize);
|
||||
assert_eq!(ptr, record.len() - buf.remaining());
|
||||
|
||||
let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;
|
||||
|
||||
|
||||
@@ -10,26 +10,24 @@ use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{Error, Result};
|
||||
use anyhow::{bail, Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::cmp::{max, min};
|
||||
use std::cell::Cell;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::thread::JoinHandle;
|
||||
use std::thread_local;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use tracing::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
@@ -39,6 +37,7 @@ use zenith_utils::zid::ZTimelineId;
|
||||
//
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
wal_receiver_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -46,6 +45,26 @@ lazy_static! {
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
// Boolean that is true only for WAL receiver threads
|
||||
//
|
||||
// This is used in `wait_lsn` to guard against usage that might lead to a deadlock.
|
||||
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
|
||||
}
|
||||
|
||||
// Wait for walreceiver to stop
|
||||
// Now it stops when pageserver shutdown is requested.
|
||||
// In future we can make this more granular and send shutdown signals
|
||||
// per tenant/timeline to cancel inactive walreceivers.
|
||||
// TODO deal with blocking pg connections
|
||||
pub fn stop_wal_receiver(timelineid: ZTimelineId) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
if let Some(r) = receivers.get_mut(&timelineid) {
|
||||
r.wal_receiver_handle.take();
|
||||
// r.wal_receiver_handle.take().map(JoinHandle::join);
|
||||
}
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -60,18 +79,19 @@ pub fn launch_wal_receiver(
|
||||
receiver.wal_producer_connstr = wal_producer_connstr.into();
|
||||
}
|
||||
None => {
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
let wal_receiver_handle = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
thread_main(conf, timelineid, &tenantid);
|
||||
IS_WAL_RECEIVER.with(|c| c.set(true));
|
||||
thread_main(conf, timelineid, tenantid);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
wal_receiver_handle: Some(wal_receiver_handle),
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -90,17 +110,15 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
|
||||
//
|
||||
// This is the entry point for the WAL receiver thread.
|
||||
//
|
||||
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: &ZTenantId) {
|
||||
info!(
|
||||
"WAL receiver thread started for timeline : '{}'",
|
||||
timelineid
|
||||
);
|
||||
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
|
||||
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
info!("WAL receiver thread started");
|
||||
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
//
|
||||
loop {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
@@ -114,13 +132,14 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
debug!("WAL streaming shut down");
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
conf: &PageServerConf,
|
||||
_conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
tenantid: &ZTenantId,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<(), Error> {
|
||||
// Connect to the database in replication mode.
|
||||
info!("connecting to {:?}", wal_producer_connstr);
|
||||
@@ -146,8 +165,7 @@ fn walreceiver_main(
|
||||
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
|
||||
let mut caught_up = false;
|
||||
|
||||
let repository = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let timeline = repository.get_timeline(timelineid).unwrap();
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
@@ -158,15 +176,15 @@ fn walreceiver_main(
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
if startpoint == Lsn(0) {
|
||||
error!("No previous WAL position");
|
||||
bail!("No previous WAL position");
|
||||
}
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
startpoint += startpoint.calc_padding(8u32);
|
||||
|
||||
info!(
|
||||
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, timelineid, end_of_wal
|
||||
"last_record_lsn {} starting replication from {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, end_of_wal
|
||||
);
|
||||
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
@@ -176,7 +194,7 @@ fn walreceiver_main(
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(RelishTag::Checkpoint, 0, startpoint)?;
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn(RelishTag::Checkpoint, 0, startpoint)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
|
||||
@@ -188,78 +206,49 @@ fn walreceiver_main(
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
let prev_last_rec_lsn = last_rec_lsn;
|
||||
|
||||
write_wal_file(
|
||||
conf,
|
||||
startlsn,
|
||||
&timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
data,
|
||||
tenantid,
|
||||
)?;
|
||||
|
||||
trace!("received XLogData between {} and {}", startlsn, endlsn);
|
||||
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// Save old checkpoint value to compare with it after decoding WAL record
|
||||
let old_checkpoint_bytes = checkpoint.encode();
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
let _enter = info_span!("processing record", lsn = %lsn).entered();
|
||||
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
// at risk of hittind a deadlock.
|
||||
assert!(lsn.is_aligned());
|
||||
|
||||
let writer = timeline.writer();
|
||||
|
||||
let mut checkpoint_modified = false;
|
||||
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
restore_local_repo::save_decoded_record(
|
||||
&mut checkpoint,
|
||||
&*timeline,
|
||||
&mut checkpoint_modified,
|
||||
writer.as_ref(),
|
||||
&decoded,
|
||||
recdata,
|
||||
lsn,
|
||||
)?;
|
||||
last_rec_lsn = lsn;
|
||||
|
||||
let new_checkpoint_bytes = checkpoint.encode();
|
||||
// Check if checkpoint data was updated by save_decoded_record
|
||||
if new_checkpoint_bytes != old_checkpoint_bytes {
|
||||
timeline.put_page_image(
|
||||
if checkpoint_modified {
|
||||
let new_checkpoint_bytes = checkpoint.encode();
|
||||
|
||||
writer.put_page_image(
|
||||
RelishTag::Checkpoint,
|
||||
0,
|
||||
lsn,
|
||||
new_checkpoint_bytes,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
|
||||
// "checkpoint" the repository to flush all the changes from WAL we've processed
|
||||
// so far to disk. After this, we don't need the original WAL anymore, and it
|
||||
// can be removed. This is probably too aggressive for production, but it's useful
|
||||
// to expose bugs now.
|
||||
//
|
||||
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
|
||||
// and we might it for logical decoding other things in the future. Although
|
||||
// we should also be able to fetch it back from the WAL safekeepers or S3 if
|
||||
// needed.
|
||||
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
{
|
||||
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
|
||||
let (oldest_segno, newest_segno) = find_wal_file_range(
|
||||
conf,
|
||||
&timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
last_rec_lsn,
|
||||
tenantid,
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||
}
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
last_rec_lsn = lsn;
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
@@ -283,7 +272,7 @@ fn walreceiver_main(
|
||||
);
|
||||
|
||||
if reply_requested {
|
||||
Some(timeline.get_last_record_lsn())
|
||||
Some(last_rec_lsn)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -303,57 +292,24 @@ fn walreceiver_main(
|
||||
|
||||
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
}
|
||||
|
||||
if tenant_mgr::shutdown_requested() {
|
||||
debug!("stop walreceiver because pageserver shutdown is requested");
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_wal_file_range(
|
||||
conf: &PageServerConf,
|
||||
timeline: &ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
written_upto: Lsn,
|
||||
tenant: &ZTenantId,
|
||||
) -> Result<(u64, u64)> {
|
||||
let written_upto_segno = written_upto.segment_number(wal_seg_size);
|
||||
|
||||
let mut oldest_segno = written_upto_segno;
|
||||
let mut newest_segno = written_upto_segno;
|
||||
// Scan the wal directory, and count how many WAL filed we could remove
|
||||
let wal_dir = conf.wal_dir_path(timeline, tenant);
|
||||
for entry in fs::read_dir(wal_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let filename = path.file_name().unwrap().to_str().unwrap();
|
||||
|
||||
if IsXLogFileName(filename) {
|
||||
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
|
||||
|
||||
if segno > written_upto_segno {
|
||||
// that's strange.
|
||||
warn!("there is a WAL file from future at {}", path.display());
|
||||
continue;
|
||||
}
|
||||
|
||||
oldest_segno = min(oldest_segno, segno);
|
||||
newest_segno = max(newest_segno, segno);
|
||||
}
|
||||
}
|
||||
// FIXME: would be good to assert that there are no gaps in the WAL files
|
||||
|
||||
Ok((oldest_segno, newest_segno))
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
///
|
||||
/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html
|
||||
#[derive(Debug)]
|
||||
// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as
|
||||
// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900
|
||||
#[allow(dead_code)]
|
||||
pub struct IdentifySystem {
|
||||
systemid: u64,
|
||||
timeline: u32,
|
||||
@@ -394,98 +350,3 @@ pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
|
||||
Err(IdentifyError.into())
|
||||
}
|
||||
}
|
||||
|
||||
fn write_wal_file(
|
||||
conf: &PageServerConf,
|
||||
startpos: Lsn,
|
||||
timelineid: &ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
buf: &[u8],
|
||||
tenantid: &ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
let wal_dir = conf.wal_dir_path(timelineid, tenantid);
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size);
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
|
||||
/*
|
||||
* If crossing a WAL boundary, only write up until we reach wal
|
||||
* segment size.
|
||||
*/
|
||||
if xlogoff + bytes_left > wal_seg_size {
|
||||
bytes_to_write = wal_seg_size - xlogoff;
|
||||
} else {
|
||||
bytes_to_write = bytes_left;
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
wal_seg_size,
|
||||
);
|
||||
let wal_file_path = wal_dir.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = wal_dir.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
let mut wal_file: File;
|
||||
/* Try to open already completed segment */
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
|
||||
wal_file = file;
|
||||
partial = false;
|
||||
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
|
||||
/* Try to open existed partial file */
|
||||
wal_file = file;
|
||||
partial = true;
|
||||
} else {
|
||||
/* Create and fill new partial file */
|
||||
partial = true;
|
||||
match OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
|
||||
|
||||
// FIXME: Flush the file
|
||||
//wal_file.sync_all()?;
|
||||
}
|
||||
/* Write was successful, advance our position */
|
||||
bytes_written += bytes_to_write;
|
||||
bytes_left -= bytes_to_write;
|
||||
start_pos += bytes_to_write as u64;
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if start_pos.segment_offset(wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -22,8 +22,8 @@ use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cell::RefCell;
|
||||
use rand::Rng;
|
||||
use serde::Serialize;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
@@ -43,7 +43,7 @@ use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::{Timeline, WALRecord};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::XlMultiXactCreate;
|
||||
use crate::waldecoder::XlXactParsedRecord;
|
||||
use crate::PageServerConf;
|
||||
@@ -54,13 +54,15 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::XLogRecord;
|
||||
|
||||
const WAL_REDO_WORKERS: usize = 1;
|
||||
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
///
|
||||
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
|
||||
pub struct BufferTag {
|
||||
pub rel: RelTag,
|
||||
pub blknum: u32,
|
||||
@@ -79,12 +81,11 @@ pub trait WalRedoManager: Send + Sync {
|
||||
/// the reords.
|
||||
fn request_redo(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
}
|
||||
|
||||
@@ -97,12 +98,11 @@ pub struct DummyRedoManager {}
|
||||
impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
_timeline: &dyn Timeline,
|
||||
_rel: RelishTag,
|
||||
_blknum: u32,
|
||||
_lsn: Lsn,
|
||||
_base_img: Option<Bytes>,
|
||||
_records: Vec<WALRecord>,
|
||||
_records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
Err(WalRedoError::InvalidState)
|
||||
}
|
||||
@@ -143,7 +143,7 @@ pub struct PostgresRedoManager {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
runtime: tokio::runtime::Runtime,
|
||||
process: Mutex<Option<PostgresRedoProcess>>,
|
||||
workers: [Mutex<Option<PostgresRedoProcess>>; WAL_REDO_WORKERS],
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -153,7 +153,7 @@ struct WalRedoRequest {
|
||||
lsn: Lsn,
|
||||
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
}
|
||||
|
||||
/// An error happened in WAL redo
|
||||
@@ -178,12 +178,11 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
fn request_redo(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let start_time;
|
||||
let lock_time;
|
||||
@@ -199,7 +198,9 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
|
||||
start_time = Instant::now();
|
||||
let result = {
|
||||
let mut process_guard = self.process.lock().unwrap();
|
||||
let mut process_guard = self.workers[rand::thread_rng().gen_range(0..WAL_REDO_WORKERS)]
|
||||
.lock()
|
||||
.unwrap();
|
||||
lock_time = Instant::now();
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
@@ -209,7 +210,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
|
||||
*process_guard = Some(p);
|
||||
}
|
||||
let process = (*process_guard).as_ref().unwrap();
|
||||
let process = process_guard.as_mut().unwrap();
|
||||
|
||||
self.runtime
|
||||
.block_on(self.handle_apply_request(process, &request))
|
||||
@@ -219,13 +220,6 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
|
||||
|
||||
if let Ok(page) = result {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&page);
|
||||
self.set_hint_bits(timeline, &mut buf, lsn, &request.records);
|
||||
return Ok(buf.freeze());
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
@@ -248,118 +242,7 @@ impl PostgresRedoManager {
|
||||
runtime,
|
||||
tenantid,
|
||||
conf,
|
||||
process: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn xid_status(&self, timeline: &dyn Timeline, xid: u32, lsn: Lsn) -> u8 {
|
||||
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if let Ok(clog_page) = timeline.get_page_at_lsn_nowait(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
) {
|
||||
postgres_ffi::nonrelfile_utils::transaction_id_get_status(xid, &clog_page[..])
|
||||
} else {
|
||||
pg_constants::TRANSACTION_STATUS_IN_PROGRESS
|
||||
}
|
||||
}
|
||||
|
||||
fn set_hint_bits(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
page: &mut BytesMut,
|
||||
lsn: Lsn,
|
||||
records: &Vec<WALRecord>,
|
||||
) {
|
||||
let mut flags = LittleEndian::read_u16(
|
||||
&page[pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2],
|
||||
);
|
||||
if (flags & (pg_constants::PD_HEAP_RELATION | pg_constants::PD_NONHEAP_RELATION)) == 0 {
|
||||
// If type of relation was not determined yet,
|
||||
// then do it now
|
||||
for r in records {
|
||||
let xl_rmid = r.rec[pg_constants::XL_RMID_OFFS];
|
||||
if xl_rmid == pg_constants::RM_HEAP_ID || xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
flags |= pg_constants::PD_HEAP_RELATION;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (flags & pg_constants::PD_HEAP_RELATION) == 0 {
|
||||
flags |= pg_constants::PD_NONHEAP_RELATION;
|
||||
}
|
||||
LittleEndian::write_u16(
|
||||
&mut page[pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2],
|
||||
flags,
|
||||
);
|
||||
}
|
||||
if (flags & pg_constants::PD_HEAP_RELATION) != 0 {
|
||||
// Set hint bits for heap relation page
|
||||
let pd_lower = LittleEndian::read_u16(
|
||||
&page[pg_constants::PD_LOWER_OFFSET..pg_constants::PD_LOWER_OFFSET + 2],
|
||||
) as usize;
|
||||
let mut tid_offs = pg_constants::SIZE_OF_PAGE_HEADER_DATA;
|
||||
while tid_offs < pd_lower {
|
||||
let tid = LittleEndian::read_u32(&page[tid_offs..tid_offs + 4]);
|
||||
let lp_off = (tid & 0x7FFF) as usize;
|
||||
if ((tid >> 15) & 3) == pg_constants::LP_NORMAL {
|
||||
// normal item pointer
|
||||
let t_xmin = LittleEndian::read_u32(
|
||||
&page[lp_off + pg_constants::T_XMIN_OFFS
|
||||
..lp_off + pg_constants::T_XMIN_OFFS + 4],
|
||||
);
|
||||
let t_xmax = LittleEndian::read_u32(
|
||||
&page[lp_off + pg_constants::T_XMAX_OFFS
|
||||
..lp_off + pg_constants::T_XMAX_OFFS + 4],
|
||||
);
|
||||
let mut t_infomask = LittleEndian::read_u16(
|
||||
&page[lp_off + pg_constants::T_INFOMASK_OFFS
|
||||
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
|
||||
);
|
||||
if (t_infomask
|
||||
& (pg_constants::HEAP_XMIN_COMMITTED | pg_constants::HEAP_XMIN_INVALID))
|
||||
== 0
|
||||
&& t_xmin != 0
|
||||
{
|
||||
let status = self.xid_status(timeline, t_xmin, lsn);
|
||||
if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
|
||||
t_infomask |= pg_constants::HEAP_XMIN_COMMITTED;
|
||||
} else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
|
||||
t_infomask |= pg_constants::HEAP_XMIN_INVALID;
|
||||
}
|
||||
LittleEndian::write_u16(
|
||||
&mut page[lp_off + pg_constants::T_INFOMASK_OFFS
|
||||
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
|
||||
t_infomask,
|
||||
);
|
||||
}
|
||||
if (t_infomask
|
||||
& (pg_constants::HEAP_XMAX_COMMITTED
|
||||
| pg_constants::HEAP_XMAX_INVALID
|
||||
| pg_constants::HEAP_XMAX_IS_MULTI))
|
||||
== 0
|
||||
&& t_xmax != 0
|
||||
{
|
||||
let status = self.xid_status(timeline, t_xmax, lsn);
|
||||
if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
|
||||
t_infomask |= pg_constants::HEAP_XMAX_COMMITTED;
|
||||
} else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
|
||||
t_infomask |= pg_constants::HEAP_XMAX_INVALID;
|
||||
}
|
||||
LittleEndian::write_u16(
|
||||
&mut page[lp_off + pg_constants::T_INFOMASK_OFFS
|
||||
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
|
||||
t_infomask,
|
||||
);
|
||||
}
|
||||
}
|
||||
tid_offs += 4;
|
||||
}
|
||||
workers: [(); WAL_REDO_WORKERS].map(|_| Mutex::new(None)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -368,7 +251,7 @@ impl PostgresRedoManager {
|
||||
///
|
||||
async fn handle_apply_request(
|
||||
&self,
|
||||
process: &PostgresRedoProcess,
|
||||
process: &mut PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let rel = request.rel;
|
||||
@@ -399,7 +282,7 @@ impl PostgresRedoManager {
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
// Apply all collected WAL records
|
||||
for record in records {
|
||||
for (_lsn, record) in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
@@ -420,9 +303,11 @@ impl PostgresRedoManager {
|
||||
// Transaction manager stuff
|
||||
let rec_segno = match rel {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
if slru != SlruKind::Clog {
|
||||
panic!("Not valid XACT relish tag {:?}", rel);
|
||||
}
|
||||
assert!(
|
||||
slru == SlruKind::Clog,
|
||||
"Not valid XACT relish tag {:?}",
|
||||
rel
|
||||
);
|
||||
segno
|
||||
}
|
||||
_ => panic!("Not valid XACT relish tag {:?}", rel),
|
||||
@@ -542,7 +427,7 @@ impl PostgresRedoManager {
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
error!("could not apply WAL records: {:#}", e);
|
||||
result = Err(WalRedoError::IoError(e));
|
||||
} else {
|
||||
let img = apply_result.unwrap();
|
||||
@@ -559,8 +444,8 @@ impl PostgresRedoManager {
|
||||
/// Handle to the Postgres WAL redo process
|
||||
///
|
||||
struct PostgresRedoProcess {
|
||||
stdin: RefCell<ChildStdin>,
|
||||
stdout: RefCell<ChildStdout>,
|
||||
stdin: ChildStdin,
|
||||
stdout: ChildStdout,
|
||||
}
|
||||
|
||||
impl PostgresRedoProcess {
|
||||
@@ -580,7 +465,7 @@ impl PostgresRedoProcess {
|
||||
if datadir.exists() {
|
||||
info!("directory {:?} exists, removing", &datadir);
|
||||
if let Err(e) = fs::remove_dir_all(&datadir) {
|
||||
error!("could not remove old wal-redo-datadir: {:?}", e);
|
||||
error!("could not remove old wal-redo-datadir: {:#}", e);
|
||||
}
|
||||
}
|
||||
info!("running initdb in {:?}", datadir.display());
|
||||
@@ -653,10 +538,7 @@ impl PostgresRedoProcess {
|
||||
};
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(PostgresRedoProcess {
|
||||
stdin: RefCell::new(stdin),
|
||||
stdout: RefCell::new(stdout),
|
||||
})
|
||||
Ok(PostgresRedoProcess { stdin, stdout })
|
||||
}
|
||||
|
||||
//
|
||||
@@ -664,13 +546,14 @@ impl PostgresRedoProcess {
|
||||
// new page image.
|
||||
//
|
||||
async fn apply_wal_records(
|
||||
&self,
|
||||
&mut self,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[WALRecord],
|
||||
records: &[(Lsn, WALRecord)],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
let mut stdin = self.stdin.borrow_mut();
|
||||
let mut stdout = self.stdout.borrow_mut();
|
||||
let stdout = &mut self.stdout;
|
||||
// Buffer the writes to avoid a lot of small syscalls.
|
||||
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
@@ -687,22 +570,16 @@ impl PostgresRedoProcess {
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
if let Some(img) = base_img {
|
||||
timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
|
||||
}
|
||||
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
|
||||
for (lsn, rec) in records.iter() {
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.write_all(&build_apply_record_msg(*lsn, &rec.rec))
|
||||
.await?;
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
@@ -739,58 +616,41 @@ impl PostgresRedoProcess {
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
|
||||
// FIXME: this is a temporary hack that should go away when we refactor
|
||||
// the postgres protocol serialization + handlers.
|
||||
//
|
||||
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
|
||||
// not in the std library. To write to a BytesMut from a serde serializer,
|
||||
// we need to either:
|
||||
// - pre-allocate the required buffer space. This is annoying because we
|
||||
// shouldn't care what the exact serialized size is-- that's the
|
||||
// serializer's job.
|
||||
// - Or, we need to create a temporary "writer" (which implements the
|
||||
// `Write` trait). It's a bit awkward, because the writer consumes the
|
||||
// underlying BytesMut, and we need to extract it later with
|
||||
// `into_inner`.
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let mut buf = writer.into_inner();
|
||||
buf.put(base_img);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
let mut buf: Vec<u8> = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u32(len as u32);
|
||||
@@ -799,21 +659,19 @@ fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
buf
|
||||
}
|
||||
|
||||
@@ -58,7 +58,7 @@ impl ControlFileData {
|
||||
let expectedcrc = crc32c::crc32c(&buf[0..OFFSETOF_CRC]);
|
||||
|
||||
// Use serde to deserialize the input as a ControlFileData struct.
|
||||
let controlfile = ControlFileData::des(buf)?;
|
||||
let controlfile = ControlFileData::des_prefix(buf)?;
|
||||
|
||||
// Check the CRC
|
||||
if expectedcrc != controlfile.crc {
|
||||
|
||||
@@ -32,5 +32,5 @@ pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) ->
|
||||
}
|
||||
|
||||
let diff = id1.wrapping_sub(id2) as i32;
|
||||
return diff < 0;
|
||||
diff < 0
|
||||
}
|
||||
|
||||
@@ -46,7 +46,6 @@ pub const SIZE_OF_PAGE_HEADER: u16 = 24;
|
||||
pub const BITS_PER_HEAPBLOCK: u16 = 2;
|
||||
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
|
||||
|
||||
pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
@@ -192,31 +191,6 @@ pub const XLP_LONG_HEADER: u16 = 0x0002;
|
||||
|
||||
pub const PG_MAJORVERSION: &str = "14";
|
||||
|
||||
// Zenith specific page flags used to distinguish heap and non-heap relations
|
||||
pub const PD_HEAP_RELATION: u16 = 0x10;
|
||||
pub const PD_NONHEAP_RELATION: u16 = 0x20;
|
||||
|
||||
// bufpage.h
|
||||
pub const PD_FLAGS_OFFSET: usize = 10; // PageHeaderData.pd_flags
|
||||
pub const PD_LOWER_OFFSET: usize = 12; // PageHeaderData.pd_lower
|
||||
|
||||
// itemid.h
|
||||
pub const LP_NORMAL: u32 = 1;
|
||||
|
||||
// htup_details.h
|
||||
pub const T_XMIN_OFFS: usize = 0;
|
||||
pub const T_XMAX_OFFS: usize = 4;
|
||||
pub const T_INFOMASK_OFFS: usize = 4 * 3 + 2 * 3 + 2;
|
||||
pub const HEAP_XMIN_COMMITTED: u16 = 0x0100; /* t_xmin committed */
|
||||
pub const HEAP_XMIN_INVALID: u16 = 0x0200; /* t_xmin invalid/aborted */
|
||||
pub const HEAP_XMAX_COMMITTED: u16 = 0x0400; /* t_xmax committed */
|
||||
pub const HEAP_XMAX_INVALID: u16 = 0x0800; /* t_xmax invalid/aborted */
|
||||
pub const HEAP_XMAX_IS_MULTI: u16 = 0x1000; /* t_xmax is a MultiXactId */
|
||||
pub const SIZE_OF_PAGE_HEADER_DATA: usize = 24;
|
||||
|
||||
// xlogrecord.h
|
||||
pub const XL_RMID_OFFS: usize = 17;
|
||||
|
||||
// List of subdirectories inside pgdata.
|
||||
// Copied from src/bin/initdb/initdb.c
|
||||
pub const PGDATA_SUBDIRS: [&str; 22] = [
|
||||
|
||||
@@ -9,21 +9,23 @@
|
||||
|
||||
use crate::pg_constants;
|
||||
use crate::CheckPoint;
|
||||
use crate::ControlFileData;
|
||||
use crate::FullTransactionId;
|
||||
use crate::XLogLongPageHeaderData;
|
||||
use crate::XLogPageHeaderData;
|
||||
use crate::XLogRecord;
|
||||
use crate::XLOG_PAGE_MAGIC;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Buf, Bytes};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::max;
|
||||
use std::cmp::min;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -125,8 +127,10 @@ fn find_end_of_wal_segment(
|
||||
segno: XLogSegNo,
|
||||
tli: TimeLineID,
|
||||
wal_seg_size: usize,
|
||||
) -> u32 {
|
||||
let mut offs: usize = 0;
|
||||
start_offset: usize, // start reading at this point
|
||||
) -> Result<u32> {
|
||||
// step back to the beginning of the page to read it in...
|
||||
let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
|
||||
let mut contlen: usize = 0;
|
||||
let mut wal_crc: u32 = 0;
|
||||
let mut crc: u32 = 0;
|
||||
@@ -135,24 +139,33 @@ fn find_end_of_wal_segment(
|
||||
let file_name = XLogFileName(tli, segno, wal_seg_size);
|
||||
let mut last_valid_rec_pos: usize = 0;
|
||||
let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
|
||||
file.seek(SeekFrom::Start(offs as u64))?;
|
||||
let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];
|
||||
|
||||
while offs < wal_seg_size {
|
||||
// we are at the beginning of the page; read it in
|
||||
if offs % XLOG_BLCKSZ == 0 {
|
||||
if let Ok(bytes_read) = file.read(&mut buf) {
|
||||
if bytes_read != buf.len() {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
let bytes_read = file.read(&mut buf)?;
|
||||
if bytes_read != buf.len() {
|
||||
bail!(
|
||||
"failed to read {} bytes from {} at {}",
|
||||
XLOG_BLCKSZ,
|
||||
file_name,
|
||||
offs
|
||||
);
|
||||
}
|
||||
|
||||
let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
|
||||
let xlp_info = LittleEndian::read_u16(&buf[2..4]);
|
||||
let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
|
||||
// this is expected in current usage when valid WAL starts after page header
|
||||
if xlp_magic != XLOG_PAGE_MAGIC as u16 {
|
||||
info!("Invalid WAL file {}.partial magic {}", file_name, xlp_magic);
|
||||
break;
|
||||
trace!(
|
||||
"invalid WAL file {}.partial magic {} at {:?}",
|
||||
file_name,
|
||||
xlp_magic,
|
||||
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
|
||||
);
|
||||
}
|
||||
if offs == 0 {
|
||||
offs = XLOG_SIZE_OF_XLOG_LONG_PHD;
|
||||
@@ -162,11 +175,18 @@ fn find_end_of_wal_segment(
|
||||
} else {
|
||||
offs += XLOG_SIZE_OF_XLOG_SHORT_PHD;
|
||||
}
|
||||
// ... and step forward again if asked
|
||||
offs = max(offs, start_offset);
|
||||
|
||||
// beginning of the next record
|
||||
} else if contlen == 0 {
|
||||
let page_offs = offs % XLOG_BLCKSZ;
|
||||
let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
|
||||
if xl_tot_len == 0 {
|
||||
info!(
|
||||
"find_end_of_wal_segment reached zeros at {:?}",
|
||||
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size))
|
||||
);
|
||||
break; // zeros, reached the end
|
||||
}
|
||||
last_valid_rec_pos = offs;
|
||||
@@ -217,7 +237,7 @@ fn find_end_of_wal_segment(
|
||||
}
|
||||
}
|
||||
}
|
||||
last_valid_rec_pos as u32
|
||||
Ok(last_valid_rec_pos as u32)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -230,7 +250,8 @@ pub fn find_end_of_wal(
|
||||
data_dir: &Path,
|
||||
wal_seg_size: usize,
|
||||
precise: bool,
|
||||
) -> (XLogRecPtr, TimeLineID) {
|
||||
start_lsn: Lsn, // start reading WAL at this point or later
|
||||
) -> Result<(XLogRecPtr, TimeLineID)> {
|
||||
let mut high_segno: XLogSegNo = 0;
|
||||
let mut high_tli: TimeLineID = 0;
|
||||
let mut high_ispartial = false;
|
||||
@@ -272,19 +293,32 @@ pub fn find_end_of_wal(
|
||||
high_segno += 1;
|
||||
} else if precise {
|
||||
/* otherwise locate last record in last partial segment */
|
||||
high_offs = find_end_of_wal_segment(data_dir, high_segno, high_tli, wal_seg_size);
|
||||
if start_lsn.segment_number(wal_seg_size) > high_segno {
|
||||
bail!(
|
||||
"provided start_lsn {:?} is beyond highest segno {:?} available",
|
||||
start_lsn,
|
||||
high_segno,
|
||||
);
|
||||
}
|
||||
high_offs = find_end_of_wal_segment(
|
||||
data_dir,
|
||||
high_segno,
|
||||
high_tli,
|
||||
wal_seg_size,
|
||||
start_lsn.segment_offset(wal_seg_size),
|
||||
)?;
|
||||
}
|
||||
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
|
||||
return (high_ptr, high_tli);
|
||||
return Ok((high_ptr, high_tli));
|
||||
}
|
||||
(0, 0)
|
||||
Ok((0, 0))
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
let mut data_dir = PathBuf::new();
|
||||
data_dir.push(".");
|
||||
let wal_seg_size = 16 * 1024 * 1024;
|
||||
let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true);
|
||||
let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true, Lsn(0)).unwrap();
|
||||
println!(
|
||||
"wal_end={:>08X}{:>08X}, tli={}",
|
||||
(wal_end >> 32) as u32,
|
||||
@@ -294,7 +328,12 @@ pub fn main() {
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
|
||||
pub fn from_slice(buf: &[u8]) -> XLogRecord {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
XLogRecord::des(buf).unwrap()
|
||||
}
|
||||
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
XLogRecord::des_from(&mut buf.reader()).unwrap()
|
||||
}
|
||||
@@ -342,10 +381,12 @@ impl CheckPoint {
|
||||
Ok(CheckPoint::des(buf)?)
|
||||
}
|
||||
|
||||
// Update next XID based on provided new_xid and stored epoch.
|
||||
// Next XID should be greater than new_xid.
|
||||
// Also take in account 32-bit wrap-around.
|
||||
pub fn update_next_xid(&mut self, xid: u32) {
|
||||
/// Update next XID based on provided new_xid and stored epoch.
|
||||
/// Next XID should be greater than new_xid. This handles 32-bit
|
||||
/// XID wraparound correctly.
|
||||
///
|
||||
/// Returns 'true' if the XID was updated.
|
||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
|
||||
let full_xid = self.nextXid.value;
|
||||
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
@@ -356,35 +397,37 @@ impl CheckPoint {
|
||||
// wrap-around
|
||||
epoch += 1;
|
||||
}
|
||||
self.nextXid = FullTransactionId {
|
||||
value: (epoch << 32) | new_xid as u64,
|
||||
};
|
||||
let nextXid = (epoch << 32) | new_xid as u64;
|
||||
|
||||
if nextXid != self.nextXid.value {
|
||||
self.nextXid = FullTransactionId { value: nextXid };
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
|
||||
// Generate new, empty WAL segment.
|
||||
// We need this segment to start compute node.
|
||||
// In order to minimize changes in Postgres core, we prefer to
|
||||
// provide WAL segment from which is can extract checkpoint record in standard way,
|
||||
// rather then implement some alternative mechanism.
|
||||
//
|
||||
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
|
||||
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
|
||||
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
|
||||
|
||||
let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let hdr = XLogLongPageHeaderData {
|
||||
std: {
|
||||
XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER,
|
||||
xlp_tli: 1, // FIXME: always use Postgres timeline 1
|
||||
xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64,
|
||||
xlp_pageaddr: pageaddr,
|
||||
xlp_rem_len: 0,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
}
|
||||
},
|
||||
xlp_sysid: pg_control.system_identifier,
|
||||
xlp_sysid: system_id,
|
||||
xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
};
|
||||
@@ -392,36 +435,6 @@ pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
|
||||
let hdr_bytes = hdr.encode();
|
||||
seg_buf.extend_from_slice(&hdr_bytes);
|
||||
|
||||
let rec_hdr = XLogRecord {
|
||||
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD
|
||||
+ SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT
|
||||
+ SIZEOF_CHECKPOINT) as u32,
|
||||
xl_xid: 0, //0 is for InvalidTransactionId
|
||||
xl_prev: 0,
|
||||
xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN,
|
||||
xl_rmid: pg_constants::RM_XLOG_ID,
|
||||
xl_crc: 0,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
};
|
||||
|
||||
let mut rec_shord_hdr_bytes = BytesMut::new();
|
||||
rec_shord_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT);
|
||||
rec_shord_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8);
|
||||
|
||||
let rec_bytes = rec_hdr.encode();
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
|
||||
//calculate record checksum
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &rec_shord_hdr_bytes[..]);
|
||||
crc = crc32c_append(crc, &checkpoint_bytes[..]);
|
||||
crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
|
||||
seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
seg_buf.put_u32_le(crc);
|
||||
seg_buf.extend_from_slice(&rec_shord_hdr_bytes);
|
||||
seg_buf.extend_from_slice(&checkpoint_bytes);
|
||||
|
||||
//zero out the rest of the file
|
||||
seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
|
||||
seg_buf.freeze()
|
||||
@@ -463,7 +476,7 @@ mod tests {
|
||||
let wal_seg_size = 16 * 1024 * 1024;
|
||||
|
||||
// 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
|
||||
let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true);
|
||||
let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
|
||||
let wal_end = Lsn(wal_end);
|
||||
println!("wal_end={}, tli={}", wal_end, tli);
|
||||
assert_eq!(wal_end, "0/2000000".parse::<Lsn>().unwrap());
|
||||
@@ -489,7 +502,7 @@ mod tests {
|
||||
wal_dir.join("000000010000000000000001.partial"),
|
||||
)
|
||||
.unwrap();
|
||||
let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true);
|
||||
let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
|
||||
let wal_end = Lsn(wal_end);
|
||||
println!("wal_end={}, tli={}", wal_end, tli);
|
||||
assert_eq!(wal_end, waldump_wal_end);
|
||||
|
||||
@@ -14,9 +14,10 @@ rand = "0.8.3"
|
||||
hex = "0.4.3"
|
||||
serde = "1"
|
||||
serde_json = "1"
|
||||
tokio = { version = "1.7.1", features = ["full"] }
|
||||
tokio-postgres = "0.7.2"
|
||||
tokio = "1.11"
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
clap = "2.33.0"
|
||||
rustls = "0.19.1"
|
||||
reqwest = { version = "0.11", features = ["blocking", "json"] }
|
||||
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
@@ -1,17 +1,14 @@
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{IpAddr, SocketAddr},
|
||||
};
|
||||
use std::net::{SocketAddr, ToSocketAddrs};
|
||||
|
||||
pub struct CPlaneApi {
|
||||
// address: SocketAddr,
|
||||
auth_endpoint: &'static str,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: IpAddr, // TODO: allow host name here too
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
@@ -19,8 +16,13 @@ pub struct DatabaseInfo {
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> SocketAddr {
|
||||
SocketAddr::new(self.host, self.port)
|
||||
pub fn socket_addr(&self) -> Result<SocketAddr> {
|
||||
let host_port = format!("{}:{}", self.host, self.port);
|
||||
host_port
|
||||
.to_socket_addrs()
|
||||
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
|
||||
}
|
||||
|
||||
pub fn conn_string(&self) -> String {
|
||||
@@ -31,62 +33,35 @@ impl DatabaseInfo {
|
||||
}
|
||||
}
|
||||
|
||||
// mock cplane api
|
||||
impl CPlaneApi {
|
||||
pub fn new(_address: &SocketAddr) -> CPlaneApi {
|
||||
CPlaneApi {
|
||||
// address: address.clone(),
|
||||
}
|
||||
pub fn new(auth_endpoint: &'static str) -> CPlaneApi {
|
||||
CPlaneApi { auth_endpoint }
|
||||
}
|
||||
|
||||
pub fn check_auth(&self, user: &str, md5_response: &[u8], salt: &[u8; 4]) -> Result<()> {
|
||||
// passwords for both is "mypass"
|
||||
let auth_map: HashMap<_, &str> = vec![
|
||||
("stas@zenith", "716ee6e1c4a9364d66285452c47402b1"),
|
||||
("stas2@zenith", "3996f75df64c16a8bfaf01301b61d582"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
pub fn authenticate_proxy_request(
|
||||
&self,
|
||||
user: &str,
|
||||
database: &str,
|
||||
md5_response: &[u8],
|
||||
salt: &[u8; 4],
|
||||
) -> Result<DatabaseInfo> {
|
||||
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
|
||||
url.query_pairs_mut()
|
||||
.append_pair("login", user)
|
||||
.append_pair("database", database)
|
||||
.append_pair("md5response", std::str::from_utf8(md5_response)?)
|
||||
.append_pair("salt", &hex::encode(salt));
|
||||
|
||||
let stored_hash = auth_map
|
||||
.get(&user)
|
||||
.ok_or_else(|| anyhow::Error::msg("user not found"))?;
|
||||
let salted_stored_hash = format!(
|
||||
"md5{:x}",
|
||||
md5::compute([stored_hash.as_bytes(), salt].concat())
|
||||
);
|
||||
println!("cplane request: {}", url.as_str());
|
||||
|
||||
let received_hash = std::str::from_utf8(md5_response)?;
|
||||
let resp = reqwest::blocking::get(url)?;
|
||||
|
||||
println!(
|
||||
"auth: {} rh={} sh={} ssh={} {:?}",
|
||||
user, received_hash, stored_hash, salted_stored_hash, salt
|
||||
);
|
||||
|
||||
if received_hash == salted_stored_hash {
|
||||
Ok(())
|
||||
if resp.status().is_success() {
|
||||
let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got conn info: #{:?}", conn_info);
|
||||
Ok(conn_info)
|
||||
} else {
|
||||
bail!("Auth failed")
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_database_uri(&self, _user: &str, _database: &str) -> Result<DatabaseInfo> {
|
||||
Ok(DatabaseInfo {
|
||||
host: "127.0.0.1".parse()?,
|
||||
port: 5432,
|
||||
dbname: "stas".to_string(),
|
||||
user: "stas".to_string(),
|
||||
password: "mypass".to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
// pub fn create_database(&self, _user: &String, _database: &String) -> Result<DatabaseInfo> {
|
||||
// Ok(DatabaseInfo {
|
||||
// host: "127.0.0.1".parse()?,
|
||||
// port: 5432,
|
||||
// dbname: "stas".to_string(),
|
||||
// user: "stas".to_string(),
|
||||
// password: "mypass".to_string(),
|
||||
// })
|
||||
// }
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ pub struct ProxyConf {
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub cplane_address: SocketAddr,
|
||||
pub auth_endpoint: String,
|
||||
|
||||
pub ssl_config: Option<Arc<ServerConfig>>,
|
||||
}
|
||||
@@ -56,8 +56,7 @@ fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerCo
|
||||
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
|
||||
let mut keys = pemfile::rsa_private_keys(&mut &key_bytes[..])
|
||||
.or_else(|_| pemfile::pkcs8_private_keys(&mut &key_bytes[..]))
|
||||
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().unwrap()
|
||||
@@ -102,6 +101,14 @@ fn main() -> anyhow::Result<()> {
|
||||
.help("redirect unauthenticated users to given uri")
|
||||
.default_value("http://localhost:3000/psql_session/"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("auth-endpoint")
|
||||
.short("a")
|
||||
.long("auth-endpoint")
|
||||
.takes_value(true)
|
||||
.help("redirect unauthenticated users to given uri")
|
||||
.default_value("http://localhost:3000/authenticate_proxy_request/"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("ssl-key")
|
||||
.short("k")
|
||||
@@ -122,7 +129,7 @@ fn main() -> anyhow::Result<()> {
|
||||
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
|
||||
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
|
||||
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
|
||||
cplane_address: "127.0.0.1:3000".parse()?,
|
||||
auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
|
||||
ssl_config: configure_ssl(&arg_matches)?,
|
||||
};
|
||||
let state = ProxyState {
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::{
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::Deserialize;
|
||||
use zenith_utils::{
|
||||
postgres_backend::{self, query_from_cstring, AuthType, PostgresBackend},
|
||||
pq_proto::{BeMessage, SINGLE_COL_ROWDESC},
|
||||
@@ -34,7 +34,7 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:
|
||||
|
||||
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let mut conn_handler = MgmtHandler { state };
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ struct MgmtHandler {
|
||||
// "host": "127.0.0.1",
|
||||
// "port": 5432,
|
||||
// "dbname": "stas",
|
||||
// "user": "stas"
|
||||
// "user": "stas",
|
||||
// "password": "mypass"
|
||||
// }
|
||||
// }
|
||||
@@ -60,13 +60,16 @@ struct MgmtHandler {
|
||||
// "Failure": "oops"
|
||||
// }
|
||||
// }
|
||||
#[derive(Serialize, Deserialize)]
|
||||
//
|
||||
// // to test manually by sending a query to mgmt interface:
|
||||
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
|
||||
#[derive(Deserialize)]
|
||||
pub struct PsqlSessionResponse {
|
||||
session_id: String,
|
||||
result: PsqlSessionResult,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Deserialize)]
|
||||
pub enum PsqlSessionResult {
|
||||
Success(DatabaseInfo),
|
||||
Failure(String),
|
||||
@@ -78,34 +81,47 @@ impl postgres_backend::Handler for MgmtHandler {
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
let res = try_process_query(self, pgb, query_string);
|
||||
// intercept and log error message
|
||||
if res.is_err() {
|
||||
println!("Mgmt query failed: #{:?}", res);
|
||||
}
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
fn try_process_query(
|
||||
mgmt: &mut MgmtHandler,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
|
||||
let waiters = self.state.waiters.lock().unwrap();
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
let waiters = mgmt.state.waiters.lock().unwrap();
|
||||
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
}
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,13 +57,14 @@ pub fn proxy_conn_main(
|
||||
) -> anyhow::Result<()> {
|
||||
let mut conn = ProxyConnection {
|
||||
state,
|
||||
cplane: CPlaneApi::new(&state.conf.cplane_address),
|
||||
cplane: CPlaneApi::new(&state.conf.auth_endpoint),
|
||||
user: "".into(),
|
||||
database: "".into(),
|
||||
pgb: PostgresBackend::new(
|
||||
socket,
|
||||
postgres_backend::AuthType::MD5,
|
||||
state.conf.ssl_config.clone(),
|
||||
false,
|
||||
)?,
|
||||
md5_salt: [0u8; 4],
|
||||
psql_session_id: "".into(),
|
||||
@@ -80,6 +81,8 @@ pub fn proxy_conn_main(
|
||||
conn.handle_new_user()?
|
||||
};
|
||||
|
||||
// XXX: move that inside handle_new_user/handle_existing_user to be able to
|
||||
// report wrong connection error.
|
||||
proxy_pass(conn.pgb, db_info)
|
||||
}
|
||||
|
||||
@@ -172,21 +175,31 @@ impl ProxyConnection {
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
|
||||
|
||||
if let Err(e) = self.check_auth_md5(md5_response) {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
bail!("auth failed: {}", e);
|
||||
} else {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
}
|
||||
match self.cplane.authenticate_proxy_request(
|
||||
self.user.as_str(),
|
||||
self.database.as_str(),
|
||||
md5_response,
|
||||
&self.md5_salt,
|
||||
) {
|
||||
Err(e) => {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
|
||||
// ok, we are authorized
|
||||
self.cplane.get_database_uri(&self.user, &self.database)
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
Ok(conn_info) => {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(conn_info)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bail!("protocol violation");
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
@@ -232,17 +245,11 @@ databases without opening the browser.
|
||||
|
||||
Ok(dbinfo)
|
||||
}
|
||||
|
||||
fn check_auth_md5(&self, md5_response: &[u8]) -> anyhow::Result<()> {
|
||||
assert!(self.is_existing_user());
|
||||
self.cplane
|
||||
.check_auth(self.user.as_str(), md5_response, &self.md5_salt)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
|
||||
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
|
||||
let _ = config.connect_raw(&mut socket, NoTls).await?;
|
||||
Ok(socket)
|
||||
@@ -273,7 +280,9 @@ fn proxy(
|
||||
|
||||
/// Proxy a client connection to a postgres database
|
||||
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread().build()?;
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let db_stream = runtime.block_on(connect_to_db(db_info))?;
|
||||
let db_stream = db_stream.into_std()?;
|
||||
db_stream.set_nonblocking(false)?;
|
||||
|
||||
@@ -8,4 +8,8 @@
|
||||
# warnings and errors right in the editor.
|
||||
# In vscode, this setting is Rust-analyzer>Check On Save:Command
|
||||
|
||||
cargo clippy "${@:2}" -- -A clippy::new_without_default -A clippy::manual_range_contains -A clippy::comparison_chain
|
||||
|
||||
# * `-A unknown_lints` – do not warn about unknown lint suppressions
|
||||
# that people with newer toolchains might use
|
||||
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
|
||||
cargo clippy "${@:2}" --all-targets --all-features --all --tests -- -A unknown_lints -D warnings
|
||||
|
||||
@@ -9,11 +9,16 @@ psycopg2 = "*"
|
||||
typing-extensions = "*"
|
||||
pyjwt = {extras = ["crypto"], version = "*"}
|
||||
requests = "*"
|
||||
pytest-xdist = "*"
|
||||
asyncpg = "*"
|
||||
cached-property = "*"
|
||||
|
||||
[dev-packages]
|
||||
yapf = "*"
|
||||
flake8 = "*"
|
||||
mypy = "*"
|
||||
# Behavior may change slightly between versions. These are run continuously,
|
||||
# so we pin exact versions to avoid suprising breaks. Update if comfortable.
|
||||
yapf = "==0.31.0"
|
||||
|
||||
[requires]
|
||||
# we need at least 3.6, but pipenv doesn't allow to say this directly
|
||||
|
||||
266
test_runner/Pipfile.lock
generated
266
test_runner/Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "b666740289d9c82797e5c39b2a7f0074c865c9183ee878ce4fa5cda7928506ea"
|
||||
"sha256": "3645ae8d2dcf55bd2a54963c44cfeedf577f3b289d1077365214a80a7f36e643"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -16,6 +16,25 @@
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"asyncpg": {
|
||||
"hashes": [
|
||||
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
|
||||
"sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
|
||||
"sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
|
||||
"sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
|
||||
"sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
|
||||
"sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
|
||||
"sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
|
||||
"sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
|
||||
"sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
|
||||
"sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
|
||||
"sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
|
||||
"sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
|
||||
"sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.24.0"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
@@ -24,97 +43,124 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"cached-property": {
|
||||
"hashes": [
|
||||
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
|
||||
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.5.2"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
|
||||
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.5.30"
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
"sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d",
|
||||
"sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771",
|
||||
"sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872",
|
||||
"sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c",
|
||||
"sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc",
|
||||
"sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762",
|
||||
"sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202",
|
||||
"sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5",
|
||||
"sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548",
|
||||
"sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a",
|
||||
"sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f",
|
||||
"sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20",
|
||||
"sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218",
|
||||
"sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c",
|
||||
"sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e",
|
||||
"sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56",
|
||||
"sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224",
|
||||
"sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a",
|
||||
"sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2",
|
||||
"sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a",
|
||||
"sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819",
|
||||
"sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346",
|
||||
"sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b",
|
||||
"sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e",
|
||||
"sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534",
|
||||
"sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb",
|
||||
"sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0",
|
||||
"sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156",
|
||||
"sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd",
|
||||
"sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87",
|
||||
"sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc",
|
||||
"sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195",
|
||||
"sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33",
|
||||
"sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f",
|
||||
"sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d",
|
||||
"sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd",
|
||||
"sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728",
|
||||
"sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7",
|
||||
"sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca",
|
||||
"sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99",
|
||||
"sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf",
|
||||
"sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e",
|
||||
"sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c",
|
||||
"sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5",
|
||||
"sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"
|
||||
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
|
||||
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
|
||||
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
|
||||
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
|
||||
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
|
||||
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
|
||||
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
|
||||
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
|
||||
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
|
||||
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
|
||||
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
|
||||
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
|
||||
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
|
||||
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
|
||||
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
|
||||
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
|
||||
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
|
||||
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
|
||||
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
|
||||
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
|
||||
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
|
||||
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
|
||||
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
|
||||
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
|
||||
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
|
||||
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
|
||||
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
|
||||
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
|
||||
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
|
||||
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
|
||||
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
|
||||
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
|
||||
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
|
||||
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
|
||||
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
|
||||
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
|
||||
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
|
||||
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
|
||||
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
|
||||
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
|
||||
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
|
||||
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
|
||||
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
|
||||
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
|
||||
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
|
||||
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
|
||||
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
|
||||
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
|
||||
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
|
||||
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
|
||||
],
|
||||
"version": "==1.14.6"
|
||||
"version": "==1.15.0"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
|
||||
"sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
|
||||
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
|
||||
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.4"
|
||||
"version": "==2.0.7"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
"sha256:0f1212a66329c80d68aeeb39b8a16d54ef57071bf22ff4e521657b27372e327d",
|
||||
"sha256:1e056c28420c072c5e3cb36e2b23ee55e260cb04eee08f702e0edfec3fb51959",
|
||||
"sha256:240f5c21aef0b73f40bb9f78d2caff73186700bf1bc6b94285699aff98cc16c6",
|
||||
"sha256:26965837447f9c82f1855e0bc8bc4fb910240b6e0d16a664bb722df3b5b06873",
|
||||
"sha256:37340614f8a5d2fb9aeea67fd159bfe4f5f4ed535b1090ce8ec428b2f15a11f2",
|
||||
"sha256:3d10de8116d25649631977cb37da6cbdd2d6fa0e0281d014a5b7d337255ca713",
|
||||
"sha256:3d8427734c781ea5f1b41d6589c293089704d4759e34597dce91014ac125aad1",
|
||||
"sha256:7ec5d3b029f5fa2b179325908b9cd93db28ab7b85bb6c1db56b10e0b54235177",
|
||||
"sha256:8e56e16617872b0957d1c9742a3f94b43533447fd78321514abbe7db216aa250",
|
||||
"sha256:b01fd6f2737816cb1e08ed4807ae194404790eac7ad030b34f2ce72b332f5586",
|
||||
"sha256:bf40af59ca2465b24e54f671b2de2c59257ddc4f7e5706dbd6930e26823668d3",
|
||||
"sha256:de4e5f7f68220d92b7637fc99847475b59154b7a1b3868fb7385337af54ac9ca",
|
||||
"sha256:eb8cc2afe8b05acbd84a43905832ec78e7b3873fb124ca190f574dca7389a87d",
|
||||
"sha256:ee77aa129f481be46f8d92a1a7db57269a2f23052d5f2433b4621bb457081cc9"
|
||||
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
|
||||
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
|
||||
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
|
||||
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
|
||||
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
|
||||
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
|
||||
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
|
||||
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
|
||||
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
|
||||
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
|
||||
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
|
||||
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
|
||||
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
|
||||
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
|
||||
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
|
||||
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
|
||||
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
|
||||
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
|
||||
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
|
||||
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
|
||||
],
|
||||
"version": "==3.4.7"
|
||||
"version": "==35.0.0"
|
||||
},
|
||||
"execnet": {
|
||||
"hashes": [
|
||||
"sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
|
||||
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.9.0"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
|
||||
"sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.2"
|
||||
"version": "==3.3"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
@@ -133,11 +179,11 @@
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
|
||||
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.13.1"
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"psycopg2": {
|
||||
"hashes": [
|
||||
@@ -175,11 +221,11 @@
|
||||
"crypto"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:934d73fbba91b0483d3857d1aff50e96b2a892384ee2c17417ed3203f173fca1",
|
||||
"sha256:fba44e7898bbca160a2b2b501f492824fc8382485d3a6f11ba5d0c1937ce6130"
|
||||
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
|
||||
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.1.0"
|
||||
"version": "==2.3.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
@@ -191,11 +237,27 @@
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b",
|
||||
"sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"
|
||||
"sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
|
||||
"sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.2.4"
|
||||
"version": "==6.2.5"
|
||||
},
|
||||
"pytest-forked": {
|
||||
"hashes": [
|
||||
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
|
||||
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"pytest-xdist": {
|
||||
"hashes": [
|
||||
"sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
|
||||
"sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
@@ -215,30 +277,30 @@
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
|
||||
"sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
|
||||
"sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.0"
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
|
||||
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
|
||||
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
|
||||
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.6"
|
||||
"version": "==1.26.7"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b",
|
||||
"sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"
|
||||
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
|
||||
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.9.2"
|
||||
"version": "==4.0.1"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
@@ -285,19 +347,19 @@
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068",
|
||||
"sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"
|
||||
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
|
||||
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.7.0"
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==2.8.0"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3",
|
||||
"sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"
|
||||
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
|
||||
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.3.1"
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
@@ -309,12 +371,12 @@
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
|
||||
"sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
|
||||
"sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.0"
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"yapf": {
|
||||
"hashes": [
|
||||
|
||||
@@ -53,8 +53,8 @@ Useful environment variables:
|
||||
should go.
|
||||
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
|
||||
|
||||
Let stdout and stderr go to the terminal instead of capturing them:
|
||||
`pytest -s ...`
|
||||
Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
|
||||
`pytest -s --log-cli-level=INFO ...`
|
||||
(Note many tests capture subprocess outputs separately, so this may not
|
||||
show much.)
|
||||
|
||||
@@ -95,11 +95,13 @@ Python destructors, e.g. `__del__()` aren't recommended for cleanup.
|
||||
|
||||
### Code quality
|
||||
|
||||
We force code formatting via yapf:
|
||||
|
||||
1. Install `yapf` and other tools (`flake8`, `mypy`) with `pipenv install --dev`.
|
||||
1. Reformat all your code by running `pipenv run yapf -ri .` in the `test_runner/` directory.
|
||||
|
||||
Before submitting a patch, please consider:
|
||||
|
||||
* Writing a couple of docstrings to clarify the reasoning behind a new test.
|
||||
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
|
||||
* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that).
|
||||
* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
|
||||
|
||||
The tools can be installed with `pipenv install --dev`.
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
|
||||
from contextlib import closing
|
||||
from typing import Iterator
|
||||
from uuid import uuid4
|
||||
import psycopg2
|
||||
from fixtures.zenith_fixtures import Postgres, ZenithCli, ZenithPageserver, PgBin
|
||||
from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin
|
||||
import pytest
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
|
||||
ps = pageserver_auth_enabled
|
||||
@@ -31,7 +33,9 @@ def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
|
||||
ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token)
|
||||
|
||||
# fail to create tenant using tenant token
|
||||
with pytest.raises(psycopg2.DatabaseError, match='Attempt to access management api with tenant scope. Permission denied'):
|
||||
with pytest.raises(
|
||||
psycopg2.DatabaseError,
|
||||
match='Attempt to access management api with tenant scope. Permission denied'):
|
||||
ps.safe_psql(f"tenant_create {uuid4().hex}", password=tenant_token)
|
||||
|
||||
|
||||
@@ -42,7 +46,8 @@ def test_compute_auth_to_pageserver(
|
||||
pageserver_auth_enabled: ZenithPageserver,
|
||||
repo_dir: str,
|
||||
with_wal_acceptors: bool,
|
||||
pg_bin: PgBin
|
||||
pg_bin: PgBin,
|
||||
port_distributor: PortDistributor,
|
||||
):
|
||||
ps = pageserver_auth_enabled
|
||||
# since we are in progress of refactoring protocols between compute safekeeper and page server
|
||||
@@ -55,14 +60,14 @@ def test_compute_auth_to_pageserver(
|
||||
wa_factory.start_n_new(3, management_token)
|
||||
|
||||
with Postgres(
|
||||
zenith_cli=zenith_cli,
|
||||
repo_dir=repo_dir,
|
||||
pg_bin=pg_bin,
|
||||
tenant_id=ps.initial_tenant,
|
||||
port=55432, # FIXME port distribution is hardcoded in tests and in cli
|
||||
zenith_cli=zenith_cli,
|
||||
repo_dir=repo_dir,
|
||||
pg_bin=pg_bin,
|
||||
tenant_id=ps.initial_tenant,
|
||||
port=port_distributor.get_port(),
|
||||
).create_start(
|
||||
branch,
|
||||
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
|
||||
branch,
|
||||
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
|
||||
) as pg:
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import subprocess
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -13,7 +13,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"])
|
||||
|
||||
pgmain = postgres.create_start('test_branch_behind')
|
||||
print("postgres is running on 'test_branch_behind' branch")
|
||||
log.info("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
@@ -27,17 +27,17 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
log.info(f'LSN after 100 rows: {lsn_a}')
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
FROM generate_series(1, 200000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
log.info(f'LSN after 200100 rows: {lsn_b}')
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
@@ -46,15 +46,15 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
FROM generate_series(1, 200000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
log.info(f'LSN after 400100 rows: {lsn_c}')
|
||||
|
||||
# Branch at the point where only 200 rows were inserted
|
||||
# Branch at the point where only 200100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
@@ -70,11 +70,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
more_pg_conn = pg_more.connect()
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (100100, )
|
||||
assert more_cur.fetchone() == (200100, )
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (200100, )
|
||||
assert main_cur.fetchone() == (400100, )
|
||||
|
||||
# Check bad lsn's for branching
|
||||
|
||||
@@ -86,7 +86,10 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
assert cur.fetchone() == (1, )
|
||||
|
||||
# branch at pre-initdb lsn
|
||||
#
|
||||
# FIXME: This works currently, but probably shouldn't be allowed
|
||||
try:
|
||||
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
|
||||
# FIXME: assert false, "branch with invalid LSN should have failed"
|
||||
except subprocess.CalledProcessError:
|
||||
print("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
log.info("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
|
||||
@@ -3,7 +3,8 @@ import os
|
||||
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -17,14 +18,17 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
|
||||
# set agressive autovacuum to make sure that truncation will happen
|
||||
config = [
|
||||
'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0',
|
||||
'autovacuum_vacuum_insert_threshold=0', 'autovacuum_vacuum_cost_delay=0',
|
||||
'autovacuum_vacuum_cost_limit=10000', 'autovacuum_naptime =1s',
|
||||
'autovacuum_max_workers=10',
|
||||
'autovacuum_vacuum_threshold=0',
|
||||
'autovacuum_vacuum_insert_threshold=0',
|
||||
'autovacuum_vacuum_cost_delay=0',
|
||||
'autovacuum_vacuum_cost_limit=10000',
|
||||
'autovacuum_naptime =1s',
|
||||
'autovacuum_freeze_max_age=100000'
|
||||
]
|
||||
|
||||
pg = postgres.create_start('test_clog_truncate', config_lines=config)
|
||||
print('postgres is running on test_clog_truncate branch')
|
||||
log.info('postgres is running on test_clog_truncate branch')
|
||||
|
||||
# Install extension containing function needed for test
|
||||
pg.safe_psql('CREATE EXTENSION zenith_test_utils')
|
||||
@@ -33,22 +37,22 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('select test_consume_xids(1000*1000*10);')
|
||||
print('xids consumed')
|
||||
log.info('xids consumed')
|
||||
|
||||
# call a checkpoint to trigger TruncateSubtrans
|
||||
cur.execute('CHECKPOINT;')
|
||||
|
||||
# ensure WAL flush
|
||||
cur.execute('select txid_current()')
|
||||
print(cur.fetchone())
|
||||
log.info(cur.fetchone())
|
||||
|
||||
# wait for autovacuum to truncate the pg_xact
|
||||
# XXX Is it worth to add a timeout here?
|
||||
pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), '0000')
|
||||
print("pg_xact_0000_path = " + pg_xact_0000_path)
|
||||
log.info(f"pg_xact_0000_path = {pg_xact_0000_path}")
|
||||
|
||||
while os.path.isfile(pg_xact_0000_path):
|
||||
print("file exists. wait for truncation. " "pg_xact_0000_path = " + pg_xact_0000_path)
|
||||
log.info(f"file exists. wait for truncation. " "pg_xact_0000_path = {pg_xact_0000_path}")
|
||||
time.sleep(5)
|
||||
|
||||
# checkpoint to advance latest lsn
|
||||
@@ -59,14 +63,14 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
lsn_after_truncation = cur.fetchone()[0]
|
||||
|
||||
# create new branch after clog truncation and start a compute node on it
|
||||
print('create branch at lsn_after_truncation ' + lsn_after_truncation)
|
||||
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
|
||||
zenith_cli.run(
|
||||
["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])
|
||||
|
||||
pg2 = postgres.create_start('test_clog_truncate_new')
|
||||
print('postgres is running on test_clog_truncate_new branch')
|
||||
log.info('postgres is running on test_clog_truncate_new branch')
|
||||
|
||||
# check that new node doesn't contain truncated segment
|
||||
pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000')
|
||||
print("pg_xact_0000_path_new = " + pg_xact_0000_path_new)
|
||||
log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}")
|
||||
assert os.path.isfile(pg_xact_0000_path_new) is False
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -14,7 +15,7 @@ def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFact
|
||||
|
||||
# change config
|
||||
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
|
||||
print('postgres is running on test_config branch')
|
||||
log.info('postgres is running on test_config branch')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -3,6 +3,7 @@ import pathlib
|
||||
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -19,7 +20,7 @@ def test_createdb(
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createdb')
|
||||
print("postgres is running on 'test_createdb' branch")
|
||||
log.info("postgres is running on 'test_createdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -40,6 +41,7 @@ def test_createdb(
|
||||
for db in (pg, pg2):
|
||||
db.connect(dbname='foodb').close()
|
||||
|
||||
|
||||
#
|
||||
# Test DROP DATABASE
|
||||
#
|
||||
@@ -48,11 +50,12 @@ def test_dropdb(
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
test_output_dir,
|
||||
):
|
||||
zenith_cli.run(["branch", "test_dropdb", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_dropdb')
|
||||
print("postgres is running on 'test_dropdb' branch")
|
||||
log.info("postgres is running on 'test_dropdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -64,7 +67,6 @@ def test_dropdb(
|
||||
cur.execute("SELECT oid FROM pg_database WHERE datname='foodb';")
|
||||
dboid = cur.fetchone()[0]
|
||||
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('DROP DATABASE foodb')
|
||||
@@ -74,7 +76,6 @@ def test_dropdb(
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_after_drop = cur.fetchone()[0]
|
||||
|
||||
|
||||
# Create two branches before and after database drop.
|
||||
zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
|
||||
pg_before = postgres.create_start('test_before_dropdb')
|
||||
@@ -87,15 +88,15 @@ def test_dropdb(
|
||||
|
||||
# Test that database subdir exists on the branch before drop
|
||||
dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
|
||||
print(dbpath)
|
||||
log.info(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == True
|
||||
|
||||
# Test that database subdir doesn't exist on the branch after drop
|
||||
dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
|
||||
print(dbpath)
|
||||
log.info(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == False
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(zenith_cli, pg, lsn_after_drop, postgres)
|
||||
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -12,7 +13,7 @@ def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: Postgres
|
||||
zenith_cli.run(["branch", "test_createuser", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createuser')
|
||||
print("postgres is running on 'test_createuser' branch")
|
||||
log.info("postgres is running on 'test_createuser' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -9,12 +10,17 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
# it only checks next_multixact_id field in restored pg_control,
|
||||
# since we don't have functions to check multixact internals.
|
||||
#
|
||||
def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, base_dir):
|
||||
def test_multixact(pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
base_dir,
|
||||
test_output_dir):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_multixact", "empty"])
|
||||
pg = postgres.create_start('test_multixact')
|
||||
|
||||
print("postgres is running on 'test_multixact' branch")
|
||||
log.info("postgres is running on 'test_multixact' branch")
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
@@ -54,7 +60,7 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
|
||||
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
|
||||
pg_new = postgres.create_start('test_multixact_new')
|
||||
|
||||
print("postgres is running on 'test_multixact_new' branch")
|
||||
log.info("postgres is running on 'test_multixact_new' branch")
|
||||
pg_new_conn = pg_new.connect()
|
||||
cur_new = pg_new_conn.cursor()
|
||||
|
||||
@@ -65,4 +71,4 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
|
||||
assert next_multixact_id_new == next_multixact_id
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)
|
||||
check_restored_datadir_content(zenith_cli, test_output_dir, pg_new, pageserver.service_port.pg)
|
||||
|
||||
70
test_runner/batch_others/test_old_request_lsn.py
Normal file
70
test_runner/batch_others/test_old_request_lsn.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test where Postgres generates a lot of WAL, and it's garbage collected away, but
|
||||
# no pages are evicted so that Postgres uses an old LSN in a GetPage request.
|
||||
# We had a bug where the page server failed to find the page version because it
|
||||
# thought it was already garbage collected away, because the LSN in the GetPage
|
||||
# request was very old and the WAL from that time was indeed already removed.
|
||||
# In reality, the LSN on a GetPage request coming from a primary server is
|
||||
# just a hint that the page hasn't been modified since that LSN, and the page
|
||||
# server should return the latest page version regardless of the LSN.
|
||||
#
|
||||
def test_old_request_lsn(zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
|
||||
pg = postgres.create_start('test_old_request_lsn')
|
||||
log.info('postgres is running on test_old_request_lsn branch')
|
||||
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
psconn = pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
||||
# shared_buffers.
|
||||
cur.execute('CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)')
|
||||
cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT g, 1, 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
|
||||
# Verify that the table is larger than shared_buffers, so that the SELECT below
|
||||
# will cause GetPage requests.
|
||||
cur.execute('''
|
||||
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
log.info(f'shared_buffers is {row[0]}, table size {row[1]}')
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
cur.execute('VACUUM foo')
|
||||
|
||||
# Make a lot of updates on a single row, generating a lot of WAL. Trigger
|
||||
# garbage collections so that the page server will remove old page versions.
|
||||
for i in range(10):
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
for j in range(100):
|
||||
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')
|
||||
|
||||
# All (or at least most of) the updates should've been on the same page, so
|
||||
# that we haven't had to evict any dirty pages for a long time. Now run
|
||||
# a query that sends GetPage@LSN requests with the old LSN.
|
||||
cur.execute("SELECT COUNT(*), SUM(val) FROM foo")
|
||||
assert cur.fetchone() == (100000, 101000)
|
||||
@@ -63,7 +63,8 @@ def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
|
||||
cur = conn.cursor()
|
||||
|
||||
# check same tenant cannot be created twice
|
||||
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {pageserver.initial_tenant} already exists'):
|
||||
with pytest.raises(psycopg2.DatabaseError,
|
||||
match=f'tenant {pageserver.initial_tenant} already exists'):
|
||||
cur.execute(f'tenant_create {pageserver.initial_tenant}')
|
||||
|
||||
# create one more tenant
|
||||
@@ -102,5 +103,6 @@ def test_pageserver_http_api_client(pageserver: ZenithPageserver):
|
||||
|
||||
|
||||
def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver):
|
||||
client = pageserver_auth_enabled.http_client(auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
|
||||
client = pageserver_auth_enabled.http_client(
|
||||
auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
|
||||
check_client(client, pageserver_auth_enabled.initial_tenant)
|
||||
|
||||
@@ -5,20 +5,24 @@ import time
|
||||
from contextlib import closing
|
||||
from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
|
||||
# times, with fault_probability chance of getting a wal acceptor down or up
|
||||
# along the way. 2 of 3 are always alive, so the work keeps going.
|
||||
def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
|
||||
def test_pageserver_restart(zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
wa_factory: WalAcceptorFactory):
|
||||
|
||||
# One safekeeper is enough for this test.
|
||||
wa_factory.start_n_new(1)
|
||||
|
||||
zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
|
||||
pg = postgres.create_start('test_pageserver_restart',
|
||||
wal_acceptors=wa_factory.get_connstrs())
|
||||
pg = postgres.create_start('test_pageserver_restart', wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
@@ -40,14 +44,14 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres:
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
print("shared_buffers is {}, table size {}", row[0], row[1]);
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
# Stop and restart pageserver. This is a more or less graceful shutdown, although
|
||||
# the page server doesn't currently have a shutdown routine so there's no difference
|
||||
# between stopping and crashing.
|
||||
pageserver.stop();
|
||||
pageserver.start();
|
||||
pageserver.stop()
|
||||
pageserver.start()
|
||||
|
||||
# Stopping the pageserver breaks the connection from the postgres backend to
|
||||
# the page server, and causes the next query on the connection to fail. Start a new
|
||||
@@ -61,6 +65,5 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres:
|
||||
assert cur.fetchone() == (100000, )
|
||||
|
||||
# Stop the page server by force, and restart it
|
||||
pageserver.stop();
|
||||
pageserver.start();
|
||||
|
||||
pageserver.stop()
|
||||
pageserver.start()
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -8,7 +9,7 @@ def test_pgbench(postgres: PostgresFactory, pg_bin, zenith_cli):
|
||||
zenith_cli.run(["branch", "test_pgbench", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_pgbench')
|
||||
print("postgres is running on 'test_pgbench' branch")
|
||||
log.info("postgres is running on 'test_pgbench' branch")
|
||||
|
||||
connstr = pg.connstr()
|
||||
|
||||
|
||||
89
test_runner/batch_others/test_readonly_node.py
Normal file
89
test_runner/batch_others/test_readonly_node.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import subprocess
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Create read-only compute nodes, anchored at historical points in time.
|
||||
#
|
||||
# This is very similar to the 'test_branch_behind' test, but instead of
|
||||
# creating branches, creates read-only nodes.
|
||||
#
|
||||
def test_readonly_node(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
zenith_cli.run(["branch", "test_readonly_node", "empty"])
|
||||
|
||||
pgmain = postgres.create_start('test_readonly_node')
|
||||
print("postgres is running on 'test_readonly_node' branch")
|
||||
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
main_cur.execute('CREATE TABLE foo (t text)')
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 200000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_b)
|
||||
|
||||
# Insert many more rows. This generates enough WAL to fill a few segments.
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 200000) g
|
||||
''')
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 400100 rows: ' + lsn_c)
|
||||
|
||||
# Create first read-only node at the point where only 100 rows were inserted
|
||||
pg_hundred = postgres.create_start("test_readonly_node_hundred",
|
||||
branch=f'test_readonly_node@{lsn_a}')
|
||||
|
||||
# And another at the point where 200100 rows were inserted
|
||||
pg_more = postgres.create_start("test_readonly_node_more", branch=f'test_readonly_node@{lsn_b}')
|
||||
|
||||
# On the 'hundred' node, we should see only 100 rows
|
||||
hundred_pg_conn = pg_hundred.connect()
|
||||
hundred_cur = hundred_pg_conn.cursor()
|
||||
hundred_cur.execute('SELECT count(*) FROM foo')
|
||||
assert hundred_cur.fetchone() == (100, )
|
||||
|
||||
# On the 'more' node, we should see 100200 rows
|
||||
more_pg_conn = pg_more.connect()
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (200100, )
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (400100, )
|
||||
|
||||
# Check creating a node at segment boundary
|
||||
pg = postgres.create_start("test_branch_segment_boundary",
|
||||
branch="test_readonly_node@0/3000000")
|
||||
cur = pg.connect().cursor()
|
||||
cur.execute('SELECT 1')
|
||||
assert cur.fetchone() == (1, )
|
||||
|
||||
# Create node at pre-initdb lsn
|
||||
try:
|
||||
zenith_cli.run(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"])
|
||||
assert false, "compute node startup with invalid LSN should have failed"
|
||||
except Exception:
|
||||
print("Node creation with pre-initdb LSN failed (as expected)")
|
||||
@@ -2,6 +2,7 @@ import pytest
|
||||
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -9,18 +10,15 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Test restarting and recreating a postgres instance
|
||||
#
|
||||
# XXX: with_wal_acceptors=True fails now, would be fixed with
|
||||
# `postgres --sync-walkeepers` patches.
|
||||
#
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False])
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
|
||||
def test_restart_compute(
|
||||
zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
wa_factory,
|
||||
with_wal_acceptors: bool,
|
||||
):
|
||||
zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
wa_factory,
|
||||
with_wal_acceptors: bool,
|
||||
):
|
||||
wal_acceptor_connstrs = None
|
||||
zenith_cli.run(["branch", "test_restart_compute", "empty"])
|
||||
|
||||
@@ -28,9 +26,8 @@ def test_restart_compute(
|
||||
wa_factory.start_n_new(3)
|
||||
wal_acceptor_connstrs = wa_factory.get_connstrs()
|
||||
|
||||
pg = postgres.create_start('test_restart_compute',
|
||||
wal_acceptors=wal_acceptor_connstrs)
|
||||
print("postgres is running on 'test_restart_compute' branch")
|
||||
pg = postgres.create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
|
||||
log.info("postgres is running on 'test_restart_compute' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -39,12 +36,10 @@ def test_restart_compute(
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
r = cur.fetchone()
|
||||
assert r == (5000050000, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
# Remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
|
||||
wal_acceptors=wal_acceptor_connstrs)
|
||||
|
||||
pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -52,7 +47,7 @@ def test_restart_compute(
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
r = cur.fetchone()
|
||||
assert r == (5000050000, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
# Insert another row
|
||||
cur.execute("INSERT INTO t VALUES (100001, 'payload2')")
|
||||
@@ -60,11 +55,10 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
# Again remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
|
||||
wal_acceptors=wal_acceptor_connstrs)
|
||||
pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
|
||||
|
||||
# That select causes lots of FPI's and increases probability of wakeepers
|
||||
# lagging behind after query completion
|
||||
@@ -75,11 +69,10 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
# And again remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
|
||||
wal_acceptors=wal_acceptor_connstrs)
|
||||
pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -88,4 +81,4 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
@@ -1,13 +1,19 @@
|
||||
from contextlib import closing
|
||||
import psycopg2.extras
|
||||
import time;
|
||||
import time
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def print_gc_result(row):
|
||||
print("GC duration {elapsed} ms".format_map(row));
|
||||
print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
|
||||
print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
|
||||
log.info("GC duration {elapsed} ms".format_map(row))
|
||||
log.info(
|
||||
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
|
||||
.format_map(row))
|
||||
log.info(
|
||||
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
|
||||
.format_map(row))
|
||||
|
||||
|
||||
#
|
||||
@@ -23,7 +29,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
with closing(pageserver.connect()) as psconn:
|
||||
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
|
||||
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
@@ -33,9 +39,9 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
cur.execute("CREATE TABLE foo(x integer)")
|
||||
cur.execute("INSERT INTO foo VALUES (1)")
|
||||
|
||||
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
|
||||
row = cur.fetchone();
|
||||
print("relfilenode is {}", row[0]);
|
||||
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass")
|
||||
row = cur.fetchone()
|
||||
log.info(f"relfilenode is {row[0]}")
|
||||
|
||||
# Run GC, to clear out any garbage left behind in the catalogs by
|
||||
# the CREATE TABLE command. We want to have a clean slate with no garbage
|
||||
@@ -50,55 +56,58 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# update to confuse our numbers either.
|
||||
cur.execute("DELETE FROM foo")
|
||||
|
||||
print("Running GC before test")
|
||||
log.info("Running GC before test")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
print_gc_result(row)
|
||||
# remember the number of files
|
||||
layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed']
|
||||
layer_relfiles_remain = (row['layer_relfiles_total'] -
|
||||
row['layer_relfiles_removed'])
|
||||
assert layer_relfiles_remain > 0
|
||||
|
||||
# Insert a row.
|
||||
print("Inserting one row and running GC")
|
||||
# Insert a row and run GC. Checkpoint should freeze the layer
|
||||
# so that there is only the most recent image layer left for the rel,
|
||||
# removing the old image and delta layer.
|
||||
log.info("Inserting one row and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (1)")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
print_gc_result(row)
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Insert two more rows and run GC.
|
||||
# This should create a new layer file with the new contents, and
|
||||
# remove the old one.
|
||||
print("Inserting two more rows and running GC")
|
||||
# This should create new image and delta layer file with the new contents, and
|
||||
# then remove the old one image and the just-created delta layer.
|
||||
log.info("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
print_gc_result(row)
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Do it again. Should again create a new layer file and remove old one.
|
||||
print("Inserting two more rows and running GC")
|
||||
# Do it again. Should again create two new layer files and remove old ones.
|
||||
log.info("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
print_gc_result(row)
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Run GC again, with no changes in the database. Should not remove anything.
|
||||
print("Run GC again, with nothing to do")
|
||||
log.info("Run GC again, with nothing to do")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
print_gc_result(row)
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain
|
||||
assert row['layer_relfiles_removed'] == 0
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
@@ -106,19 +115,26 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
#
|
||||
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
|
||||
#
|
||||
print("Drop table and run GC again");
|
||||
log.info("Drop table and run GC again")
|
||||
cur.execute("DROP TABLE foo")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
print_gc_result(row)
|
||||
|
||||
# We still cannot remove the latest layers
|
||||
# because they serve as tombstones for earlier layers.
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
assert row['layer_relfiles_dropped'] == 3
|
||||
assert row['layer_relfiles_needed_as_tombstone'] == 3
|
||||
|
||||
# The catalog updates also create new layer files of the catalogs, which
|
||||
# are counted as 'removed'
|
||||
assert row['layer_relfiles_removed'] > 0
|
||||
|
||||
# TODO Change the test to check actual CG of dropped layers.
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
#assert row['layer_relfiles_dropped'] == 3
|
||||
|
||||
# TODO: perhaps we should count catalog and user relations separately,
|
||||
# to make this kind of testing more robust
|
||||
|
||||
@@ -21,18 +21,30 @@ def test_tenants_normal_work(
|
||||
tenant_1 = tenant_factory.create()
|
||||
tenant_2 = tenant_factory.create()
|
||||
|
||||
zenith_cli.run(["branch", f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", "main", f"--tenantid={tenant_1}"])
|
||||
zenith_cli.run(["branch", f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", "main", f"--tenantid={tenant_2}"])
|
||||
zenith_cli.run([
|
||||
"branch",
|
||||
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
|
||||
"main",
|
||||
f"--tenantid={tenant_1}"
|
||||
])
|
||||
zenith_cli.run([
|
||||
"branch",
|
||||
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
|
||||
"main",
|
||||
f"--tenantid={tenant_2}"
|
||||
])
|
||||
if with_wal_acceptors:
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
pg_tenant1 = postgres.create_start(
|
||||
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
|
||||
None, # branch name, None means same as node name
|
||||
tenant_1,
|
||||
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
|
||||
)
|
||||
pg_tenant2 = postgres.create_start(
|
||||
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
|
||||
None, # branch name, None means same as node name
|
||||
tenant_2,
|
||||
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
|
||||
)
|
||||
@@ -45,4 +57,4 @@ def test_tenants_normal_work(
|
||||
cur.execute("CREATE TABLE t(key int primary key, value text)")
|
||||
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
||||
cur.execute("SELECT sum(key) FROM t")
|
||||
assert cur.fetchone() == (5000050000,)
|
||||
assert cur.fetchone() == (5000050000, )
|
||||
|
||||
36
test_runner/batch_others/test_timeline_size.py
Normal file
36
test_runner/batch_others/test_timeline_size.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from contextlib import closing
|
||||
from uuid import UUID
|
||||
import psycopg2.extras
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
def test_timeline_size(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_timeline_size", "empty"])
|
||||
|
||||
client = pageserver.http_client()
|
||||
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
|
||||
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
|
||||
|
||||
pgmain = postgres.create_start("test_timeline_size")
|
||||
log.info("postgres is running on 'test_timeline_size' branch")
|
||||
|
||||
with closing(pgmain.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
cur.execute("CREATE TABLE foo (t text)")
|
||||
cur.execute("""
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 10) g
|
||||
""")
|
||||
|
||||
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
|
||||
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
|
||||
cur.execute("TRUNCATE foo")
|
||||
|
||||
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
|
||||
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, PgBin
|
||||
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -9,11 +9,14 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Test branching, when a transaction is in prepared state
|
||||
#
|
||||
def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin: PgBin):
|
||||
def test_twophase(zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin: PgBin):
|
||||
zenith_cli.run(["branch", "test_twophase", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
|
||||
print("postgres is running on 'test_twophase' branch")
|
||||
log.info("postgres is running on 'test_twophase' branch")
|
||||
|
||||
conn = pg.connect()
|
||||
cur = conn.cursor()
|
||||
@@ -45,7 +48,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
cur.execute('CHECKPOINT')
|
||||
|
||||
twophase_files = os.listdir(pg.pg_twophase_dir_path())
|
||||
print(twophase_files)
|
||||
log.info(twophase_files)
|
||||
assert len(twophase_files) == 4
|
||||
|
||||
cur.execute("COMMIT PREPARED 'insert_three'")
|
||||
@@ -53,7 +56,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
cur.execute('CHECKPOINT')
|
||||
|
||||
twophase_files = os.listdir(pg.pg_twophase_dir_path())
|
||||
print(twophase_files)
|
||||
log.info(twophase_files)
|
||||
assert len(twophase_files) == 2
|
||||
|
||||
# Create a branch with the transaction in prepared state
|
||||
@@ -67,7 +70,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
|
||||
# Check that we restored only needed twophase files
|
||||
twophase_files2 = os.listdir(pg2.pg_twophase_dir_path())
|
||||
print(twophase_files2)
|
||||
log.info(twophase_files2)
|
||||
assert twophase_files2.sort() == twophase_files.sort()
|
||||
|
||||
conn2 = pg2.connect()
|
||||
@@ -79,8 +82,8 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
cur2.execute("ROLLBACK PREPARED 'insert_two'")
|
||||
|
||||
cur2.execute('SELECT * FROM foo')
|
||||
assert cur2.fetchall() == [('one',), ('three',)]
|
||||
assert cur2.fetchall() == [('one', ), ('three', )]
|
||||
|
||||
# Only one committed insert is visible on the original branch
|
||||
cur.execute('SELECT * FROM foo')
|
||||
assert cur.fetchall() == [('three',)]
|
||||
assert cur.fetchall() == [('three', )]
|
||||
|
||||
@@ -1,17 +1,23 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test that the VM bit is cleared correctly at a HEAP_DELETE and
|
||||
# HEAP_UPDATE record.
|
||||
#
|
||||
def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, base_dir):
|
||||
def test_vm_bit_clear(pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
base_dir):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_vm_bit_clear", "empty"])
|
||||
pg = postgres.create_start('test_vm_bit_clear')
|
||||
|
||||
print("postgres is running on 'test_vm_bit_clear' branch")
|
||||
log.info("postgres is running on 'test_vm_bit_clear' branch")
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
@@ -48,13 +54,12 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
|
||||
''')
|
||||
|
||||
cur.execute('SELECT * FROM vmtest_delete WHERE id = 1')
|
||||
assert(cur.fetchall() == []);
|
||||
assert (cur.fetchall() == [])
|
||||
cur.execute('SELECT * FROM vmtest_update WHERE id = 1')
|
||||
assert(cur.fetchall() == []);
|
||||
assert (cur.fetchall() == [])
|
||||
|
||||
cur.close()
|
||||
|
||||
|
||||
# Check the same thing on the branch that we created right after the DELETE
|
||||
#
|
||||
# As of this writing, the code in smgrwrite() creates a full-page image whenever
|
||||
@@ -63,7 +68,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
|
||||
# server at the right point-in-time avoids that full-page image.
|
||||
pg_new = postgres.create_start('test_vm_bit_clear_new')
|
||||
|
||||
print("postgres is running on 'test_vm_bit_clear_new' branch")
|
||||
log.info("postgres is running on 'test_vm_bit_clear_new' branch")
|
||||
pg_new_conn = pg_new.connect()
|
||||
cur_new = pg_new_conn.cursor()
|
||||
|
||||
@@ -74,6 +79,6 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
|
||||
''')
|
||||
|
||||
cur_new.execute('SELECT * FROM vmtest_delete WHERE id = 1')
|
||||
assert(cur_new.fetchall() == []);
|
||||
assert (cur_new.fetchall() == [])
|
||||
cur_new.execute('SELECT * FROM vmtest_update WHERE id = 1')
|
||||
assert(cur_new.fetchall() == []);
|
||||
assert (cur_new.fetchall() == [])
|
||||
|
||||
@@ -1,17 +1,25 @@
|
||||
import pytest
|
||||
import random
|
||||
import time
|
||||
import os
|
||||
import subprocess
|
||||
import uuid
|
||||
|
||||
from contextlib import closing
|
||||
from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin
|
||||
from fixtures.utils import lsn_to_hex, mkdir_if_needed
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
# basic test, write something in setup with wal acceptors, ensure that commits
|
||||
# succeed and data is written
|
||||
def test_normal_work(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory):
|
||||
def test_normal_work(zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
wa_factory):
|
||||
zenith_cli.run(["branch", "test_wal_acceptors_normal_work", "empty"])
|
||||
wa_factory.start_n_new(3)
|
||||
pg = postgres.create_start('test_wal_acceptors_normal_work',
|
||||
@@ -29,7 +37,10 @@ def test_normal_work(zenith_cli, pageserver: ZenithPageserver, postgres: Postgre
|
||||
|
||||
# Run page server and multiple acceptors, and multiple compute nodes running
|
||||
# against different timelines.
|
||||
def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory):
|
||||
def test_many_timelines(zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
wa_factory):
|
||||
n_timelines = 2
|
||||
|
||||
wa_factory.start_n_new(3)
|
||||
@@ -61,7 +72,10 @@ def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: Post
|
||||
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
|
||||
# times, with fault_probability chance of getting a wal acceptor down or up
|
||||
# along the way. 2 of 3 are always alive, so the work keeps going.
|
||||
def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
|
||||
def test_restarts(zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
wa_factory: WalAcceptorFactory):
|
||||
fault_probability = 0.01
|
||||
n_inserts = 1000
|
||||
n_acceptors = 3
|
||||
@@ -172,7 +186,11 @@ def stop_value():
|
||||
|
||||
|
||||
# do inserts while concurrently getting up/down subsets of acceptors
|
||||
def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory, stop_value):
|
||||
def test_race_conditions(zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
wa_factory,
|
||||
stop_value):
|
||||
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
@@ -198,3 +216,120 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
|
||||
|
||||
stop_value.value = 1
|
||||
proc.join()
|
||||
|
||||
|
||||
class ProposerPostgres:
|
||||
"""Object for running safekeepers sync with walproposer"""
|
||||
def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str):
|
||||
self.pgdata_dir: str = pgdata_dir
|
||||
self.pg_bin: PgBin = pg_bin
|
||||
self.timeline_id: str = timeline_id
|
||||
self.tenant_id: str = tenant_id
|
||||
|
||||
def pg_data_dir_path(self) -> str:
|
||||
""" Path to data directory """
|
||||
return self.pgdata_dir
|
||||
|
||||
def config_file_path(self) -> str:
|
||||
""" Path to postgresql.conf """
|
||||
return os.path.join(self.pgdata_dir, 'postgresql.conf')
|
||||
|
||||
def create_dir_config(self, wal_acceptors: str):
|
||||
""" Create dir and config for running --sync-safekeepers """
|
||||
|
||||
mkdir_if_needed(self.pg_data_dir_path())
|
||||
with open(self.config_file_path(), "w") as f:
|
||||
f.writelines([
|
||||
"synchronous_standby_names = 'walproposer'\n",
|
||||
f"zenith.zenith_timeline = '{self.timeline_id}'\n",
|
||||
f"zenith.zenith_tenant = '{self.tenant_id}'\n",
|
||||
f"wal_acceptors = '{wal_acceptors}'\n",
|
||||
])
|
||||
|
||||
def sync_safekeepers(self) -> str:
|
||||
"""
|
||||
Run 'postgres --sync-safekeepers'.
|
||||
Returns execution result, which is commit_lsn after sync.
|
||||
"""
|
||||
|
||||
command = ["postgres", "--sync-safekeepers"]
|
||||
env = {
|
||||
"PGDATA": self.pg_data_dir_path(),
|
||||
}
|
||||
|
||||
basepath = self.pg_bin.run_capture(command, env)
|
||||
stdout_filename = basepath + '.stdout'
|
||||
|
||||
with open(stdout_filename, 'r') as stdout_f:
|
||||
stdout = stdout_f.read()
|
||||
return stdout.strip("\n ")
|
||||
|
||||
|
||||
# insert wal in all safekeepers and run sync on proposer
|
||||
def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorFactory):
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
timeline_id = uuid.uuid4().hex
|
||||
tenant_id = uuid.uuid4().hex
|
||||
|
||||
# write config for proposer
|
||||
pgdata_dir = os.path.join(repo_dir, "proposer_pgdata")
|
||||
pg = ProposerPostgres(pgdata_dir, pg_bin, timeline_id, tenant_id)
|
||||
pg.create_dir_config(wa_factory.get_connstrs())
|
||||
|
||||
# valid lsn, which is not in the segment start, nor in zero segment
|
||||
epoch_start_lsn = 0x16B9188 # 0/16B9188
|
||||
begin_lsn = epoch_start_lsn
|
||||
|
||||
# append and commit WAL
|
||||
lsn_after_append = []
|
||||
for i in range(3):
|
||||
res = wa_factory.instances[i].append_logical_message(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
{
|
||||
"lm_prefix": "prefix",
|
||||
"lm_message": "message",
|
||||
"set_commit_lsn": True,
|
||||
"term": 2,
|
||||
"begin_lsn": begin_lsn,
|
||||
"epoch_start_lsn": epoch_start_lsn,
|
||||
"truncate_lsn": epoch_start_lsn,
|
||||
},
|
||||
)
|
||||
lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"])
|
||||
lsn_after_append.append(lsn_hex)
|
||||
log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}")
|
||||
|
||||
# run sync safekeepers
|
||||
lsn_after_sync = pg.sync_safekeepers()
|
||||
log.info(f"lsn after sync = {lsn_after_sync}")
|
||||
|
||||
assert all(lsn_after_sync == lsn for lsn in lsn_after_append)
|
||||
|
||||
|
||||
def test_timeline_status(zenith_cli, pageserver, postgres, wa_factory: WalAcceptorFactory):
|
||||
wa_factory.start_n_new(1)
|
||||
|
||||
zenith_cli.run(["branch", "test_timeline_status", "empty"])
|
||||
pg = postgres.create_start('test_timeline_status', wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
wa = wa_factory.instances[0]
|
||||
wa_http_cli = wa.http_client()
|
||||
wa_http_cli.check_status()
|
||||
|
||||
# learn zenith timeline from compute
|
||||
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
|
||||
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
|
||||
|
||||
# fetch something sensible from status
|
||||
epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch
|
||||
|
||||
pg.safe_psql("create table t(i int)")
|
||||
|
||||
# ensure epoch goes up after reboot
|
||||
pg.stop().start()
|
||||
pg.safe_psql("insert into t values(10)")
|
||||
|
||||
epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch
|
||||
assert epoch_after_reboot > epoch
|
||||
|
||||
168
test_runner/batch_others/test_wal_acceptor_async.py
Normal file
168
test_runner/batch_others/test_wal_acceptor_async.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import asyncio
|
||||
import asyncpg
|
||||
import random
|
||||
|
||||
from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres
|
||||
from fixtures.log_helper import getLogger
|
||||
from typing import List
|
||||
|
||||
log = getLogger('root.wal_acceptor_async')
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
class BankClient(object):
|
||||
def __init__(self, conn: asyncpg.Connection, n_accounts, init_amount):
|
||||
self.conn: asyncpg.Connection = conn
|
||||
self.n_accounts = n_accounts
|
||||
self.init_amount = init_amount
|
||||
|
||||
async def initdb(self):
|
||||
await self.conn.execute('DROP TABLE IF EXISTS bank_accs')
|
||||
await self.conn.execute('CREATE TABLE bank_accs(uid int primary key, amount int)')
|
||||
await self.conn.execute(
|
||||
'''
|
||||
INSERT INTO bank_accs
|
||||
SELECT *, $1 FROM generate_series(0, $2)
|
||||
''',
|
||||
self.init_amount,
|
||||
self.n_accounts - 1)
|
||||
await self.conn.execute('DROP TABLE IF EXISTS bank_log')
|
||||
await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)')
|
||||
|
||||
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
|
||||
await self.conn.execute('ALTER TABLE bank_accs SET (autovacuum_enabled = false)')
|
||||
await self.conn.execute('ALTER TABLE bank_log SET (autovacuum_enabled = false)')
|
||||
|
||||
async def check_invariant(self):
|
||||
row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs')
|
||||
assert row['sum'] == self.n_accounts * self.init_amount
|
||||
|
||||
|
||||
async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount):
|
||||
# avoid deadlocks by sorting uids
|
||||
if from_uid > to_uid:
|
||||
from_uid, to_uid, amount = to_uid, from_uid, -amount
|
||||
|
||||
async with conn.transaction():
|
||||
await conn.execute(
|
||||
'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2',
|
||||
amount,
|
||||
to_uid,
|
||||
)
|
||||
await conn.execute(
|
||||
'UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2',
|
||||
amount,
|
||||
from_uid,
|
||||
)
|
||||
await conn.execute(
|
||||
'INSERT INTO bank_log VALUES ($1, $2, $3)',
|
||||
from_uid,
|
||||
to_uid,
|
||||
amount,
|
||||
)
|
||||
|
||||
|
||||
class WorkerStats(object):
|
||||
def __init__(self, n_workers):
|
||||
self.counters = [0] * n_workers
|
||||
self.running = True
|
||||
|
||||
def reset(self):
|
||||
self.counters = [0] * len(self.counters)
|
||||
|
||||
def inc_progress(self, worker_id):
|
||||
self.counters[worker_id] += 1
|
||||
|
||||
def check_progress(self):
|
||||
log.debug("Workers progress: {}".format(self.counters))
|
||||
|
||||
# every worker should finish at least one tx
|
||||
assert all(cnt > 0 for cnt in self.counters)
|
||||
|
||||
progress = sum(self.counters)
|
||||
log.info('All workers made {} transactions'.format(progress))
|
||||
|
||||
|
||||
async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer):
|
||||
pg_conn = await pg.connect_async()
|
||||
log.debug('Started worker {}'.format(worker_id))
|
||||
|
||||
while stats.running:
|
||||
from_uid = random.randint(0, n_accounts - 1)
|
||||
to_uid = (from_uid + random.randint(1, n_accounts - 1)) % n_accounts
|
||||
amount = random.randint(1, max_transfer)
|
||||
|
||||
await bank_transfer(pg_conn, from_uid, to_uid, amount)
|
||||
stats.inc_progress(worker_id)
|
||||
|
||||
log.debug('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid))
|
||||
|
||||
log.debug('Finished worker {}'.format(worker_id))
|
||||
|
||||
await pg_conn.close()
|
||||
|
||||
|
||||
# This test will run several iterations and check progress in each of them.
|
||||
# On each iteration 1 acceptor is stopped, and 2 others should allow
|
||||
# background workers execute transactions. In the end, state should remain
|
||||
# consistent.
|
||||
async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_workers=10):
|
||||
n_accounts = 100
|
||||
init_amount = 100000
|
||||
max_transfer = 100
|
||||
period_time = 10
|
||||
iterations = 6
|
||||
|
||||
pg_conn = await pg.connect_async()
|
||||
bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount)
|
||||
# create tables and initial balances
|
||||
await bank.initdb()
|
||||
|
||||
stats = WorkerStats(n_workers)
|
||||
workers = []
|
||||
for worker_id in range(n_workers):
|
||||
worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer)
|
||||
workers.append(asyncio.create_task(worker))
|
||||
|
||||
for it in range(iterations):
|
||||
victim = acceptors[it % len(acceptors)]
|
||||
victim.stop()
|
||||
|
||||
# Wait till previous victim recovers so it is ready for the next
|
||||
# iteration by making any writing xact.
|
||||
conn = await pg.connect_async()
|
||||
await conn.execute('UPDATE bank_accs SET amount = amount WHERE uid = 1', timeout=120)
|
||||
await conn.close()
|
||||
|
||||
stats.reset()
|
||||
await asyncio.sleep(period_time)
|
||||
# assert that at least one transaction has completed in every worker
|
||||
stats.check_progress()
|
||||
|
||||
victim.start()
|
||||
|
||||
log.info('Iterations are finished, exiting coroutines...')
|
||||
stats.running = False
|
||||
# await all workers
|
||||
await asyncio.gather(*workers)
|
||||
# assert balances sum hasn't changed
|
||||
await bank.check_invariant()
|
||||
await pg_conn.close()
|
||||
|
||||
|
||||
# restart acceptors one by one, while executing and validating bank transactions
|
||||
def test_restarts_under_load(zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
wa_factory: WalAcceptorFactory):
|
||||
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
zenith_cli.run(["branch", "test_wal_acceptors_restarts_under_load", "empty"])
|
||||
pg = postgres.create_start('test_wal_acceptors_restarts_under_load',
|
||||
wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
asyncio.run(run_restarts_under_load(pg, wa_factory.instances))
|
||||
|
||||
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
|
||||
pg.stop()
|
||||
@@ -23,8 +23,11 @@ def helper_compare_branch_list(page_server_cur, zenith_cli, initial_tenant: str)
|
||||
|
||||
res = zenith_cli.run(["branch", f"--tenantid={initial_tenant}"])
|
||||
res.check_returncode()
|
||||
branches_cli_with_tenant_arg = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
|
||||
branches_cli_with_tenant_arg = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
|
||||
branches_cli_with_tenant_arg = sorted(
|
||||
map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
|
||||
branches_cli_with_tenant_arg = [
|
||||
b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')
|
||||
]
|
||||
|
||||
assert branches_api == branches_cli == branches_cli_with_tenant_arg
|
||||
|
||||
@@ -54,6 +57,7 @@ def test_cli_branch_list(pageserver: ZenithPageserver, zenith_cli):
|
||||
assert 'test_cli_branch_list_main' in branches_cli
|
||||
assert 'test_cli_branch_list_nested' in branches_cli
|
||||
|
||||
|
||||
def helper_compare_tenant_list(page_server_cur, zenith_cli: ZenithCli):
|
||||
page_server_cur.execute(f'tenant_list')
|
||||
tenants_api = sorted(json.loads(page_server_cur.fetchone()[0]))
|
||||
|
||||
@@ -6,8 +6,14 @@ from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def test_isolation(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
|
||||
base_dir, capsys):
|
||||
def test_isolation(pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
test_output_dir,
|
||||
pg_distrib_dir,
|
||||
base_dir,
|
||||
capsys):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_isolation", "empty"])
|
||||
|
||||
@@ -6,8 +6,14 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_re
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
|
||||
base_dir, capsys):
|
||||
def test_pg_regress(pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
test_output_dir,
|
||||
pg_distrib_dir,
|
||||
base_dir,
|
||||
capsys):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_pg_regress", "empty"])
|
||||
@@ -55,4 +61,4 @@ def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_
|
||||
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)
|
||||
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
|
||||
|
||||
@@ -1,13 +1,20 @@
|
||||
import os
|
||||
|
||||
from fixtures.utils import mkdir_if_needed
|
||||
from fixtures.zenith_fixtures import PostgresFactory, check_restored_datadir_content
|
||||
from fixtures.zenith_fixtures import PageserverPort, PostgresFactory, check_restored_datadir_content
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
|
||||
base_dir, capsys):
|
||||
def test_zenith_regress(postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
test_output_dir,
|
||||
pg_distrib_dir,
|
||||
base_dir,
|
||||
capsys,
|
||||
pageserver_port: PageserverPort):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_zenith_regress", "empty"])
|
||||
@@ -38,7 +45,7 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
|
||||
'--inputdir={}'.format(src_path),
|
||||
]
|
||||
|
||||
print(pg_regress_command)
|
||||
log.info(pg_regress_command)
|
||||
env = {
|
||||
'PGPORT': str(pg.port),
|
||||
'PGUSER': pg.username,
|
||||
@@ -56,4 +63,4 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
|
||||
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)
|
||||
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver_port.pg)
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
from pprint import pprint
|
||||
|
||||
import os
|
||||
import re
|
||||
import timeit
|
||||
@@ -26,7 +24,6 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast
|
||||
from typing_extensions import Literal
|
||||
|
||||
from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)
|
||||
|
||||
"""
|
||||
This file contains fixtures for micro-benchmarks.
|
||||
|
||||
@@ -57,7 +54,6 @@ in the test initialization, or measure disk usage after the test query.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
# All the results are collected in this list, as a tuple:
|
||||
# (test_name: str, metric_name: str, metric_value: float, unit: str)
|
||||
#
|
||||
@@ -67,6 +63,7 @@ in the test initialization, or measure disk usage after the test query.
|
||||
global zenbenchmark_results
|
||||
zenbenchmark_results = []
|
||||
|
||||
|
||||
class ZenithBenchmarkResults:
|
||||
""" An object for recording benchmark results. """
|
||||
def __init__(self):
|
||||
@@ -79,6 +76,7 @@ class ZenithBenchmarkResults:
|
||||
|
||||
self.results.append((test_name, metric_name, metric_value, unit))
|
||||
|
||||
|
||||
# Session scope fixture that initializes the results object
|
||||
@pytest.fixture(autouse=True, scope='session')
|
||||
def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
|
||||
@@ -90,6 +88,7 @@ def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
|
||||
|
||||
yield zenbenchmark_results
|
||||
|
||||
|
||||
class ZenithBenchmarker:
|
||||
"""
|
||||
An object for recording benchmark results. This is created for each test
|
||||
@@ -105,7 +104,6 @@ class ZenithBenchmarker:
|
||||
"""
|
||||
self.results.record(self.request.node.name, metric_name, metric_value, unit)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def record_duration(self, metric_name):
|
||||
"""
|
||||
@@ -136,9 +134,34 @@ class ZenithBenchmarker:
|
||||
# The metric should be an integer, as it's a number of bytes. But in general
|
||||
# all prometheus metrics are floats. So to be pedantic, read it as a float
|
||||
# and round to integer.
|
||||
matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
|
||||
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$',
|
||||
all_metrics,
|
||||
re.MULTILINE)
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
def get_peak_mem(self, pageserver) -> int:
|
||||
"""
|
||||
Fetch the "maxrss" metric from the pageserver
|
||||
"""
|
||||
# Fetch all the exposed prometheus metrics from page server
|
||||
all_metrics = pageserver.http_client().get_metrics()
|
||||
# See comment in get_io_writes()
|
||||
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE)
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):
|
||||
"""
|
||||
Calculate the on-disk size of a timeline
|
||||
"""
|
||||
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
|
||||
|
||||
totalbytes = 0
|
||||
for root, dirs, files in os.walk(path):
|
||||
for name in files:
|
||||
totalbytes += os.path.getsize(os.path.join(root, name))
|
||||
|
||||
return totalbytes
|
||||
|
||||
@contextmanager
|
||||
def record_pageserver_writes(self, pageserver, metric_name):
|
||||
"""
|
||||
@@ -148,7 +171,11 @@ class ZenithBenchmarker:
|
||||
yield
|
||||
after = self.get_io_writes(pageserver)
|
||||
|
||||
self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB')
|
||||
self.results.record(self.request.node.name,
|
||||
metric_name,
|
||||
round((after - before) / (1024 * 1024)),
|
||||
'MB')
|
||||
|
||||
|
||||
@pytest.fixture(scope='function')
|
||||
def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
|
||||
@@ -162,9 +189,7 @@ def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
|
||||
|
||||
# Hook to print the results at the end
|
||||
@pytest.hookimpl(hookwrapper=True)
|
||||
def pytest_terminal_summary(
|
||||
terminalreporter: TerminalReporter, exitstatus: int, config: Config
|
||||
):
|
||||
def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config):
|
||||
yield
|
||||
|
||||
global zenbenchmark_results
|
||||
|
||||
45
test_runner/fixtures/log_helper.py
Normal file
45
test_runner/fixtures/log_helper.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import logging
|
||||
import logging.config
|
||||
"""
|
||||
This file configures logging to use in python tests.
|
||||
Logs are automatically captured and shown in their
|
||||
own section after all tests are executed.
|
||||
|
||||
To see logs for all (even successful) tests, run
|
||||
pytest with the following command:
|
||||
- `pipenv run pytest -n8 -rA`
|
||||
|
||||
Other log config can be set in pytest.ini file.
|
||||
You can add `log_cli = true` to it to watch
|
||||
logs in real time.
|
||||
|
||||
To get more info about logging with pytest, see
|
||||
https://docs.pytest.org/en/6.2.x/logging.html
|
||||
"""
|
||||
|
||||
# this config is only used for default log levels,
|
||||
# log format is specified in pytest.ini file
|
||||
LOGGING = {
|
||||
"version": 1,
|
||||
"loggers": {
|
||||
"root": {
|
||||
"level": "INFO"
|
||||
},
|
||||
"root.wal_acceptor_async": {
|
||||
"level": "INFO" # a lot of logs on DEBUG level
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def getLogger(name='root') -> logging.Logger:
|
||||
"""Method to get logger for tests.
|
||||
|
||||
Should be used to get correctly initialized logger. """
|
||||
return logging.getLogger(name)
|
||||
|
||||
|
||||
# default logger for tests
|
||||
log = getLogger()
|
||||
|
||||
logging.config.dictConfig(LOGGING)
|
||||
@@ -2,6 +2,7 @@ import os
|
||||
import subprocess
|
||||
|
||||
from typing import Any, List
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
def get_self_dir() -> str:
|
||||
@@ -14,13 +15,14 @@ def mkdir_if_needed(path: str) -> None:
|
||||
|
||||
Note this won't try to create intermediate directories.
|
||||
"""
|
||||
if os.path.exists(path):
|
||||
assert os.path.isdir(path)
|
||||
return
|
||||
os.mkdir(path)
|
||||
try:
|
||||
os.mkdir(path)
|
||||
except FileExistsError:
|
||||
pass
|
||||
assert os.path.isdir(path)
|
||||
|
||||
|
||||
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
|
||||
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
|
||||
""" Run a process and capture its output
|
||||
|
||||
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
@@ -28,6 +30,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
|
||||
counter.
|
||||
|
||||
If those files already exist, we will overwrite them.
|
||||
Returns basepath for files with captured output.
|
||||
"""
|
||||
assert type(cmd) is list
|
||||
base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
|
||||
@@ -37,9 +40,11 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
|
||||
|
||||
with open(stdout_filename, 'w') as stdout_f:
|
||||
with open(stderr_filename, 'w') as stderr_f:
|
||||
print('(capturing output to "{}.stdout")'.format(base))
|
||||
log.info('(capturing output to "{}.stdout")'.format(base))
|
||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||
|
||||
return basepath
|
||||
|
||||
|
||||
_global_counter = 0
|
||||
|
||||
@@ -53,3 +58,8 @@ def global_counter() -> int:
|
||||
global _global_counter
|
||||
_global_counter += 1
|
||||
return _global_counter
|
||||
|
||||
|
||||
def lsn_to_hex(num: int) -> str:
|
||||
""" Convert lsn from int to standard hex notation. """
|
||||
return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
60
test_runner/performance/test_bulk_insert.py
Normal file
60
test_runner/performance/test_bulk_insert.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
|
||||
#
|
||||
# Run bulk INSERT test.
|
||||
#
|
||||
# Collects metrics:
|
||||
#
|
||||
# 1. Time to INSERT 5 million rows
|
||||
# 2. Disk writes
|
||||
# 3. Disk space used
|
||||
# 4. Peak memory usage
|
||||
#
|
||||
def test_bulk_insert(postgres: PostgresFactory,
|
||||
pageserver: ZenithPageserver,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
zenbenchmark,
|
||||
repo_dir: str):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_bulk_insert", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_bulk_insert')
|
||||
log.info("postgres is running on 'test_bulk_insert' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
cur.execute("create table huge (i int, j int);")
|
||||
|
||||
# Run INSERT, recording the time and I/O it takes
|
||||
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
|
||||
with zenbenchmark.record_duration('insert'):
|
||||
cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
|
||||
|
||||
# Flush the layers from memory to disk. This is included in the reported
|
||||
# time and I/O
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
|
||||
# Record peak memory usage
|
||||
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(repo_dir,
|
||||
pageserver.initial_tenant,
|
||||
timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
|
||||
61
test_runner/performance/test_bulk_tenant_create.py
Normal file
61
test_runner/performance/test_bulk_tenant_create.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import timeit
|
||||
import pytest
|
||||
|
||||
from fixtures.zenith_fixtures import (
|
||||
TenantFactory,
|
||||
ZenithCli,
|
||||
PostgresFactory,
|
||||
)
|
||||
|
||||
pytest_plugins = ("fixtures.benchmark_fixture")
|
||||
|
||||
# Run bulk tenant creation test.
|
||||
#
|
||||
# Collects metrics:
|
||||
#
|
||||
# 1. Time to create {1,10,50} tenants
|
||||
# 2. Average creation time per tenant
|
||||
|
||||
|
||||
@pytest.mark.parametrize('tenants_count', [1, 5, 10])
|
||||
@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa'])
|
||||
def test_bulk_tenant_create(
|
||||
zenith_cli: ZenithCli,
|
||||
tenant_factory: TenantFactory,
|
||||
postgres: PostgresFactory,
|
||||
wa_factory,
|
||||
use_wal_acceptors: str,
|
||||
tenants_count: int,
|
||||
zenbenchmark,
|
||||
):
|
||||
"""Measure tenant creation time (with and without wal acceptors)"""
|
||||
|
||||
time_slices = []
|
||||
|
||||
for i in range(tenants_count):
|
||||
start = timeit.default_timer()
|
||||
|
||||
tenant = tenant_factory.create()
|
||||
zenith_cli.run([
|
||||
"branch",
|
||||
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
|
||||
"main",
|
||||
f"--tenantid={tenant}"
|
||||
])
|
||||
|
||||
if use_wal_acceptors == 'with_wa':
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
pg_tenant = postgres.create_start(
|
||||
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
|
||||
None, # branch name, None means same as node name
|
||||
tenant,
|
||||
wal_acceptors=wa_factory.get_connstrs() if use_wal_acceptors == 'with_wa' else None,
|
||||
)
|
||||
|
||||
end = timeit.default_timer()
|
||||
time_slices.append(end - start)
|
||||
|
||||
pg_tenant.stop()
|
||||
|
||||
zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's')
|
||||
61
test_runner/performance/test_gist_build.py
Normal file
61
test_runner/performance/test_gist_build.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
|
||||
#
|
||||
# Test buffering GisT build. It WAL-logs the whole relation, in 32-page chunks.
|
||||
# As of this writing, we're duplicate those giant WAL records for each page,
|
||||
# which makes the delta layer about 32x larger than it needs to be.
|
||||
#
|
||||
def test_gist_buffering_build(postgres: PostgresFactory,
|
||||
pageserver: ZenithPageserver,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
zenbenchmark,
|
||||
repo_dir: str):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_gist_buffering_build", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_gist_buffering_build')
|
||||
log.info("postgres is running on 'test_gist_buffering_build' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
# Create test table.
|
||||
cur.execute("create table gist_point_tbl(id int4, p point)")
|
||||
cur.execute(
|
||||
"insert into gist_point_tbl select g, point(g, g) from generate_series(1, 1000000) g;"
|
||||
)
|
||||
|
||||
# Build the index.
|
||||
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
|
||||
with zenbenchmark.record_duration('build'):
|
||||
cur.execute(
|
||||
"create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)"
|
||||
)
|
||||
|
||||
# Flush the layers from memory to disk. This is included in the reported
|
||||
# time and I/O
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 1000000")
|
||||
|
||||
# Record peak memory usage
|
||||
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(repo_dir,
|
||||
pageserver.initial_tenant,
|
||||
timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
|
||||
@@ -1,21 +1,10 @@
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
|
||||
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
|
||||
|
||||
totalbytes = 0
|
||||
for root, dirs, files in os.walk(path):
|
||||
for name in files:
|
||||
totalbytes += os.path.getsize(os.path.join(root, name))
|
||||
|
||||
if 'wal' in dirs:
|
||||
dirs.remove('wal') # don't visit 'wal' subdirectory
|
||||
|
||||
return totalbytes
|
||||
|
||||
#
|
||||
# Run a very short pgbench test.
|
||||
@@ -26,16 +15,21 @@ def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
|
||||
# 2. Time to run 5000 pgbench transactions
|
||||
# 3. Disk space used
|
||||
#
|
||||
def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
|
||||
def test_pgbench(postgres: PostgresFactory,
|
||||
pageserver: ZenithPageserver,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
zenbenchmark,
|
||||
repo_dir: str):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_pgbench_perf", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_pgbench_perf')
|
||||
print("postgres is running on 'test_pgbench_perf' branch")
|
||||
log.info("postgres is running on 'test_pgbench_perf' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = pageserver.connect();
|
||||
psconn = pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
@@ -64,5 +58,5 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
|
||||
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
|
||||
|
||||
83
test_runner/performance/test_write_amplification.py
Normal file
83
test_runner/performance/test_write_amplification.py
Normal file
@@ -0,0 +1,83 @@
|
||||
# Demonstrate Write Amplification with naive oldest-first layer checkpointing
|
||||
# algorithm.
|
||||
#
|
||||
# In each iteration of the test, we create a new table that's slightly under 10
|
||||
# MB in size (10 MB is the current "segment size" used by the page server). Then
|
||||
# we make a tiny update to all the tables already created. This creates a WAL
|
||||
# pattern where you have a lot of updates on one segment (the newly created
|
||||
# one), alternating with a small updates on all relations. This is the worst
|
||||
# case scenario for the naive checkpointing policy where we write out the layers
|
||||
# in LSN order, writing the oldest layer first. That creates a new 10 MB image
|
||||
# layer to be created for each of those small updates. This is the Write
|
||||
# Amplification problem at its finest.
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
|
||||
def test_write_amplification(postgres: PostgresFactory,
|
||||
pageserver: ZenithPageserver,
|
||||
pg_bin,
|
||||
zenith_cli,
|
||||
zenbenchmark,
|
||||
repo_dir: str):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_write_amplification", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_write_amplification')
|
||||
log.info("postgres is running on 'test_write_amplification' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
|
||||
with zenbenchmark.record_duration('run'):
|
||||
|
||||
# NOTE: Because each iteration updates every table already created,
|
||||
# the runtime and write amplification is O(n^2), where n is the
|
||||
# number of iterations.
|
||||
for i in range(25):
|
||||
cur.execute(f'''
|
||||
CREATE TABLE tbl{i} AS
|
||||
SELECT g as i, 'long string to consume some space' || g as t
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
cur.execute(f"create index on tbl{i} (i);")
|
||||
for j in range(1, i):
|
||||
cur.execute(f"delete from tbl{j} where i = {i}")
|
||||
|
||||
# Force checkpointing. As of this writing, we don't have
|
||||
# a back-pressure mechanism, and the page server cannot
|
||||
# keep up digesting and checkpointing the WAL at the
|
||||
# rate that it is generated. If we don't force a
|
||||
# checkpoint, the WAL will just accumulate in memory
|
||||
# until you hit OOM error. So in effect, we use much
|
||||
# more memory to hold the incoming WAL, and write them
|
||||
# out in larger batches than we'd really want. Using
|
||||
# more memory hides the write amplification problem this
|
||||
# test tries to demonstrate.
|
||||
#
|
||||
# The write amplification problem is real, and using
|
||||
# more memory isn't the right solution. We could
|
||||
# demonstrate the effect also by generating the WAL
|
||||
# slower, adding some delays in this loop. But forcing
|
||||
# the the checkpointing and GC makes the test go faster,
|
||||
# with the same total I/O effect.
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(repo_dir,
|
||||
pageserver.initial_tenant,
|
||||
timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user