mirror of https://github.com/neondatabase/neon.git
synced 2026-01-18 19:02:56 +00:00

Compare commits
10 Commits
try-no-loc ... local_file
| Author | SHA1 | Date |
|---|---|---|
|  | 458ca82d75 |  |
|  | 0dface838d |  |
|  | f7878c5157 |  |
|  | 53c870b8e7 |  |
|  | c23b65914e |  |
|  | b51d3f6b2b |  |
|  | 137472db91 |  |
|  | f817985f2b |  |
|  | 2e94e1428e |  |
|  | 9a486ca109 |  |
@@ -24,12 +24,6 @@ jobs:
  # A job to build postgres
  build-postgres:
    executor: zenith-build-executor
    parameters:
      build_type:
        type: enum
        enum: ["debug", "release"]
    environment:
      BUILD_TYPE: << parameters.build_type >>
    steps:
      # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
      - checkout
@@ -45,7 +39,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

      # FIXME We could cache our own docker container, instead of installing packages every time.
      - run:
@@ -65,12 +59,12 @@ jobs:
            if [ ! -e tmp_install/bin/postgres ]; then
              # "depth 1" saves some time by not cloning the whole repo
              git submodule update --init --depth 1
              make postgres -j8
              make postgres
            fi

      - save_cache:
          name: Save postgres cache
          key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
          key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
          paths:
            - tmp_install

@@ -102,7 +96,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

      - restore_cache:
          name: Restore rust cache
@@ -241,21 +235,17 @@ jobs:
            if << parameters.run_in_parallel >>; then
              EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
            fi;
            ./netstat-script.sh &
            NS_PID=$!
            # Run the tests.
            #
            # The junit.xml file allows CircleCI to display more fine-grained test information
            # in its "Tests" tab in the results page.
            # -s prevents pytest from capturing output, which helps to see
            # what's going on if the test hangs
            # --verbose prints name of each test (helpful when there are
            # multiple tests in one file)
            # -rA prints summary in the end
            # -n4 uses four processes to run tests via pytest-xdist
            # -s is not used to prevent pytest from capturing output, because tests are running
            # in parallel and logs are mixed between different tests
            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
            kill $NS_PID
            awk '/===/ {if (count) print count; print; count=0; next} {count++} END {print count}' $TEST_OUTPUT/netstat.stdout > $TEST_OUTPUT/netstat_stats.stdout
            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
      - run:
          # CircleCI artifacts are preserved one file at a time, so skipping
          # this step isn't a good idea. If you want to extract the
@@ -264,7 +254,7 @@ jobs:
          when: always
          command: |
            du -sh /tmp/test_output/*
            find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
            find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" -delete
            du -sh /tmp/test_output/*
      - store_artifacts:
          path: /tmp/test_output
@@ -329,7 +319,8 @@ jobs:
            \"inputs\": {
              \"ci_job_name\": \"zenith-remote-ci\",
              \"commit_hash\": \"$CIRCLE_SHA1\",
              \"remote_repo\": \"$LOCAL_REPO\"
              \"remote_repo\": \"$LOCAL_REPO\",
              \"zenith_image_branch\": \"$CIRCLE_BRANCH\"
            }
          }"

@@ -337,18 +328,14 @@ workflows:
  build_and_test:
    jobs:
      - check-codestyle
      - build-postgres:
          name: build-postgres-<< matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
      - build-postgres
      - build-zenith:
          name: build-zenith-<< matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          requires:
            - build-postgres-<< matrix.build_type >>
            - build-postgres
      - run-pytest:
          name: pg_regress-tests-<< matrix.build_type >>
          matrix:
@@ -2,17 +2,12 @@
**/__pycache__
**/.pytest_cache

.git
target
tmp_check
tmp_install
tmp_check_cli
test_output
.vscode
.zenith
integration_tests/.zenith
.mypy_cache

Dockerfile
.dockerignore

/target
/tmp_check
/tmp_install
/tmp_check_cli
/test_output
/.vscode
/.zenith
/integration_tests/.zenith
/Dockerfile
275 Cargo.lock generated
@@ -26,21 +26,18 @@ dependencies = [
 "winapi",
]

[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
 "winapi",
]

[[package]]
name = "anyhow"
version = "1.0.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "595d3cfa7a60d4555cb5067b99f07142a08ea778de5cf993f7b75c7d8fabc486"

[[package]]
name = "arc-swap"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e906254e445520903e7fc9da4f709886c84ae4bc4ddaf0e093188d66df4dc820"

[[package]]
name = "async-trait"
version = "0.1.50"
@@ -301,7 +298,7 @@ version = "2.33.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
dependencies = [
 "ansi_term 0.11.0",
 "ansi_term",
 "atty",
 "bitflags",
 "strsim",
@@ -310,26 +307,6 @@ dependencies = [
 "vec_map",
]

[[package]]
name = "const_format"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4556f63e28a78fa5e6f310cfea5647a25636def49a338ab69e33b34a3382057b"
dependencies = [
 "const_format_proc_macros",
]

[[package]]
name = "const_format_proc_macros"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "552782506c398da94466b364973b563887e0ca078bf33a76d4163736165e3594"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-xid",
]

[[package]]
name = "control_plane"
version = "0.1.0"
@@ -390,6 +367,26 @@ dependencies = [
 "rustc_version",
]

[[package]]
name = "crossbeam-channel"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
dependencies = [
 "cfg-if 1.0.0",
 "crossbeam-utils",
]

[[package]]
name = "crossbeam-utils"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
dependencies = [
 "cfg-if 1.0.0",
 "lazy_static",
]

[[package]]
name = "crypto-mac"
version = "0.10.0"
@@ -428,6 +425,16 @@ dependencies = [
 "dirs-sys",
]

[[package]]
name = "dirs-next"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1"
dependencies = [
 "cfg-if 1.0.0",
 "dirs-sys-next",
]

[[package]]
name = "dirs-sys"
version = "0.3.6"
@@ -439,6 +446,17 @@ dependencies = [
 "winapi",
]

[[package]]
name = "dirs-sys-next"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
dependencies = [
 "libc",
 "redox_users",
 "winapi",
]

[[package]]
name = "dlv-list"
version = "0.2.3"
@@ -918,15 +936,6 @@ dependencies = [
 "cfg-if 1.0.0",
]

[[package]]
name = "matchers"
version = "0.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
dependencies = [
 "regex-automata",
]

[[package]]
name = "matches"
version = "0.1.8"
@@ -1170,12 +1179,10 @@ dependencies = [
 "bytes",
 "chrono",
 "clap",
 "const_format",
 "crc32c",
 "daemonize",
 "futures",
 "hex",
 "hex-literal",
 "humantime",
 "hyper",
 "lazy_static",
@@ -1191,12 +1198,10 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
 "signal-hook",
 "tar",
 "thiserror",
 "tokio",
 "toml",
 "tracing",
 "workspace_hack",
 "zenith_metrics",
 "zenith_utils",
@@ -1401,7 +1406,6 @@ dependencies = [
 "hex",
 "md5",
 "rand",
 "reqwest",
 "rustls",
 "serde",
 "serde_json",
@@ -1504,15 +1508,6 @@ dependencies = [
 "regex-syntax",
]

[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
dependencies = [
 "regex-syntax",
]

[[package]]
name = "regex-syntax"
version = "0.6.25"
@@ -1671,6 +1666,12 @@ dependencies = [
 "webpki",
]

[[package]]
name = "rustversion"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088"

[[package]]
name = "ryu"
version = "1.0.5"
@@ -1828,32 +1829,12 @@ dependencies = [
 "opaque-debug",
]

[[package]]
name = "sharded-slab"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982"
dependencies = [
 "lazy_static",
]

[[package]]
name = "shlex"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42a568c8f2cd051a4d283bd6eb0343ac214c1b0f1ac19f93e1175b2dee38c73d"

[[package]]
name = "signal-hook"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c98891d737e271a2954825ef19e46bd16bdb98e2746f2eec4f7a4ef7946efd1"
dependencies = [
 "cc",
 "libc",
 "signal-hook-registry",
]

[[package]]
name = "signal-hook-registry"
version = "1.4.0"
@@ -1886,6 +1867,59 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527"

[[package]]
name = "slog"
version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06"

[[package]]
name = "slog-async"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c60813879f820c85dbc4eabf3269befe374591289019775898d56a81a804fbdc"
dependencies = [
 "crossbeam-channel",
 "slog",
 "take_mut",
 "thread_local",
]

[[package]]
name = "slog-scope"
version = "4.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f95a4b4c3274cd2869549da82b57ccc930859bdbf5bcea0424bc5f140b3c786"
dependencies = [
 "arc-swap",
 "lazy_static",
 "slog",
]

[[package]]
name = "slog-stdlog"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8228ab7302adbf4fcb37e66f3cda78003feb521e7fd9e3847ec117a7784d0f5a"
dependencies = [
 "log",
 "slog",
 "slog-scope",
]

[[package]]
name = "slog-term"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95c1e7e5aab61ced6006149ea772770b84a0d16ce0f7885def313e4829946d76"
dependencies = [
 "atty",
 "chrono",
 "slog",
 "term",
 "thread_local",
]

[[package]]
name = "smallvec"
version = "1.6.1"
@@ -1941,6 +1975,12 @@ dependencies = [
 "unicode-xid",
]

[[package]]
name = "take_mut"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"

[[package]]
name = "tap"
version = "1.0.1"
@@ -1972,6 +2012,17 @@ dependencies = [
 "winapi",
]

[[package]]
name = "term"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
dependencies = [
 "dirs-next",
 "rustversion",
 "winapi",
]

[[package]]
name = "termcolor"
version = "1.1.2"
@@ -2149,79 +2200,24 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"

[[package]]
name = "tracing"
version = "0.1.29"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105"
checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
dependencies = [
 "cfg-if 1.0.0",
 "pin-project-lite",
 "tracing-attributes",
 "tracing-core",
]

[[package]]
name = "tracing-attributes"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4f480b8f81512e825f337ad51e94c1eb5d3bbdf2b363dcd01e2b19a9ffe3f8e"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "tracing-core"
version = "0.1.21"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4"
checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
dependencies = [
 "lazy_static",
]

[[package]]
name = "tracing-log"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
dependencies = [
 "lazy_static",
 "log",
 "tracing-core",
]

[[package]]
name = "tracing-serde"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
dependencies = [
 "serde",
 "tracing-core",
]

[[package]]
name = "tracing-subscriber"
version = "0.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e0d2eaa99c3c2e41547cfa109e910a68ea03823cccad4a0525dcbc9b01e8c71"
dependencies = [
 "ansi_term 0.12.1",
 "chrono",
 "lazy_static",
 "matchers",
 "regex",
 "serde",
 "serde_json",
 "sharded-slab",
 "smallvec",
 "thread_local",
 "tracing",
 "tracing-core",
 "tracing-log",
 "tracing-serde",
]

[[package]]
name = "try-lock"
version = "0.2.3"
@@ -2320,13 +2316,11 @@ dependencies = [
 "byteorder",
 "bytes",
 "clap",
 "const_format",
 "crc32c",
 "daemonize",
 "fs2",
 "hex",
 "humantime",
 "hyper",
 "lazy_static",
 "log",
 "pageserver",
@@ -2334,15 +2328,12 @@ dependencies = [
 "postgres-protocol",
 "postgres_ffi",
 "regex",
 "routerify",
 "rust-s3",
 "serde",
 "serde_json",
 "tokio",
 "tokio-stream",
 "walkdir",
 "workspace_hack",
 "zenith_metrics",
 "zenith_utils",
]

@@ -2563,7 +2554,6 @@ version = "0.1.0"
dependencies = [
 "lazy_static",
 "libc",
 "once_cell",
 "prometheus",
]

@@ -2588,12 +2578,13 @@ dependencies = [
 "rustls-split",
 "serde",
 "serde_json",
 "tempfile",
 "slog",
 "slog-async",
 "slog-scope",
 "slog-stdlog",
 "slog-term",
 "thiserror",
 "tokio",
 "tracing",
 "tracing-log",
 "tracing-subscriber",
 "webpki",
 "workspace_hack",
 "zenith_metrics",
38 Dockerfile
@@ -10,21 +10,39 @@ FROM zenithdb/build:buster AS pg-build
WORKDIR /zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
ENV BUILD_TYPE release
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
RUN rm -rf postgres_install/build

#
# Calculate cargo dependencies.
# This will always run, but only generate recipe.json with a list of dependencies, without
# installing them.
#
FROM zenithdb/build:buster AS cargo-deps-inspect
WORKDIR /zenith
COPY . .
RUN cargo chef prepare --recipe-path /zenith/recipe.json

#
# Build cargo dependencies.
# This temp container should be rebuilt only if recipe.json was changed.
#
FROM zenithdb/build:buster AS deps-build
WORKDIR /zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /usr/local/cargo/bin/cargo-chef /usr/local/cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json

#
# Build zenith binaries
#
# TODO: build cargo deps as a separate layer. We used cargo-chef before, but that was
# a net time waste in a lot of cases. Copying Cargo.lock with an empty lib.rs should do the work.
#
FROM zenithdb/build:buster AS build
WORKDIR /zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server

COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /usr/local/cargo/ /usr/local/cargo/
RUN cargo build --release

#
@@ -33,13 +51,11 @@ RUN cargo build --release
FROM debian:buster-slim
WORKDIR /data

RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \
RUN apt-get update && apt-get -yq install librocksdb-dev libseccomp-dev openssl && \
    mkdir zenith_install

COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
# TODO: temporary alias for compatibility, see https://github.com/zenithdb/zenith/pull/740
RUN ln -s /usr/local/bin/safekeeper /usr/local/bin/wal_acceptor
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install postgres_install
COPY docker-entrypoint.sh /docker-entrypoint.sh

@@ -81,9 +81,7 @@ FROM alpine:3.13
RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
# TODO: temporary alias for compatibility, see https://github.com/zenithdb/zenith/pull/740
RUN ln -s /usr/local/bin/safekeeper /usr/local/bin/wal_acceptor
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh

@@ -9,7 +9,7 @@ WORKDIR /zenith
# Install postgres and zenith build dependencies
# clang is for rocksdb
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
    libseccomp-dev pkg-config libssl-dev clang
    libseccomp-dev pkg-config libssl-dev librocksdb-dev clang

# Install rust tools
RUN rustup component add clippy && cargo install cargo-audit
RUN rustup component add clippy && cargo install cargo-chef cargo-audit
58 Makefile
@@ -6,55 +6,34 @@ else
SECCOMP =
endif

#
# We differentiate between release / debug build types using the BUILD_TYPE
# environment variable.
#
BUILD_TYPE ?= debug
ifeq ($(BUILD_TYPE),release)
	PG_CONFIGURE_OPTS = --enable-debug
	PG_CFLAGS = -O2 -g3 $(CFLAGS)
	# Unfortunately, `--profile=...` is a nightly feature
	CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
	PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
	PG_CFLAGS = -O0 -g3 $(CFLAGS)
else
	$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
endif

# Choose whether we should be silent or verbose
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
# Fix for a corner case when make doesn't pass a jobserver
CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))

# This option has a side effect of passing make jobserver to cargo.
# However, we shouldn't do this if `make -n` (--dry-run) has been asked.
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1

#
# Top level Makefile to build Zenith and PostgreSQL
#
.PHONY: all
all: zenith postgres

# We don't want to run 'cargo build' in parallel with the postgres build,
# because interleaving cargo build output with postgres build output looks
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
# much parallelism. (Recursive invocation of postgres target still gets any
# '-j' flag from the command line, so 'make -j' is still useful.)
.NOTPARALLEL:

### Zenith Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: zenith
zenith: postgres-headers
	+@echo "Compiling Zenith"
	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
	cargo build

### PostgreSQL parts
tmp_install/build/config.status:
	+@echo "Configuring postgres build"
	mkdir -p tmp_install/build
	(cd tmp_install/build && \
	../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
	$(PG_CONFIGURE_OPTS) \
	../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
	--enable-cassert \
	--enable-debug \
	--enable-depend \
	$(SECCOMP) \
	--prefix=$(abspath tmp_install) > configure.log)

@@ -68,10 +47,10 @@ postgres-headers: postgres-configure
	+@echo "Installing PostgreSQL headers"
	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install


# Compile and install PostgreSQL and contrib/zenith
.PHONY: postgres
postgres: postgres-configure \
	postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
postgres: postgres-configure
	+@echo "Compiling PostgreSQL"
	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
	+@echo "Compiling contrib/zenith"
@@ -79,21 +58,18 @@ postgres: postgres-configure \
	+@echo "Compiling contrib/zenith_test_utils"
	$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install

.PHONY: postgres-clean
postgres-clean:
	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean

# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean:
	cd tmp_install/build && $(MAKE) clean
	$(CARGO_CMD_PREFIX) cargo clean
	cd tmp_install/build && ${MAKE} clean
	cargo clean

# This removes everything
.PHONY: distclean
distclean:
	rm -rf tmp_install
	$(CARGO_CMD_PREFIX) cargo clean
	cargo clean

.PHONY: fmt
fmt:
@@ -25,7 +25,7 @@ Pageserver consists of:
On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev
libssl-dev clang
```

[Rust] 1.52 or later is also required.
@@ -108,13 +108,6 @@ postgres=# insert into t values(2,2);
INSERT 0 1
```

6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
```sh
> ./target/debug/zenith pg stop migration_check
> ./target/debug/zenith pg stop main
> ./target/debug/zenith stop
```

## Running tests

```sh
@@ -1,16 +1,17 @@
use std::collections::BTreeMap;
use std::fs::{self, File};
use std::fs::{self, File, OpenOptions};
use std::io::Write;
use std::net::SocketAddr;
use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::{collections::BTreeMap, path::PathBuf};

use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use zenith_utils::connstring::connection_host_port;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
@@ -18,7 +19,6 @@ use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId;

use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use crate::storage::PageServerNode;

//
@@ -144,25 +144,76 @@ impl PostgresNode {
        );
    }

    lazy_static! {
        static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
        static ref CONF_TIMELINE_RE: Regex =
            Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
        static ref CONF_TENANT_RE: Regex =
            Regex::new(r"(?m)^\s*zenith.zenith_tenant\s*=\s*'(\w+)'\s*$").unwrap();
    }

    // parse data directory name
    let fname = entry.file_name();
    let name = fname.to_str().unwrap().to_string();

    // Read config file into memory
    // find out tcp port in config file
    let cfg_path = entry.path().join("postgresql.conf");
    let cfg_path_str = cfg_path.to_string_lossy();
    let mut conf_file = File::open(&cfg_path)
        .with_context(|| format!("failed to open config file in {}", cfg_path_str))?;
    let conf = PostgresConf::read(&mut conf_file)
        .with_context(|| format!("failed to read config file in {}", cfg_path_str))?;
    let config = fs::read_to_string(cfg_path.clone()).with_context(|| {
        format!(
            "failed to read config file in {}",
            cfg_path.to_str().unwrap()
        )
    })?;

    // Read a few options from the config file
    let context = format!("in config file {}", cfg_path_str);
    let port: u16 = conf.parse_field("port", &context)?;
    let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
    let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
    // parse port
    let err_msg = format!(
        "failed to find port definition in config file {}",
        cfg_path.to_str().unwrap()
    );
    let port: u16 = CONF_PORT_RE
        .captures(config.as_str())
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
        .iter()
        .last()
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
        .as_str()
        .parse()
        .with_context(|| err_msg)?;

    let uses_wal_proposer = conf.get("wal_acceptors").is_some();
    // parse timeline
    let err_msg = format!(
        "failed to find timeline definition in config file {}",
        cfg_path.to_str().unwrap()
    );
    let timelineid: ZTimelineId = CONF_TIMELINE_RE
        .captures(config.as_str())
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
        .iter()
        .last()
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
        .as_str()
        .parse()
        .with_context(|| err_msg)?;

    // parse tenant
    let err_msg = format!(
        "failed to find tenant definition in config file {}",
        cfg_path.to_str().unwrap()
    );
    let tenantid = CONF_TENANT_RE
        .captures(config.as_str())
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
        .iter()
        .last()
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
        .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
        .as_str()
        .parse()
        .with_context(|| err_msg)?;

    let uses_wal_proposer = config.contains("wal_acceptors");

    // ok now
    Ok(PostgresNode {
@@ -257,58 +308,74 @@ impl PostgresNode {
    // Connect to a page server, get base backup, and untar it to initialize a
    // new data directory
    fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
        File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;

        // wal_log_hints is mandatory when running against pageserver (see gh issue#192)
        // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
        conf.append("wal_log_hints", "on");
        conf.append("max_replication_slots", "10");
        conf.append("hot_standby", "on");
        conf.append("shared_buffers", "1MB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
        conf.append("wal_sender_timeout", "0");
        conf.append("wal_level", "replica");
        conf.append("listen_addresses", &self.address.ip().to_string());
        conf.append("port", &self.address.port().to_string());
        self.append_conf(
            "postgresql.conf",
            &format!(
                "max_wal_senders = 10\n\
                 wal_log_hints = on\n\
                 max_replication_slots = 10\n\
                 hot_standby = on\n\
                 shared_buffers = 1MB\n\
                 fsync = off\n\
                 max_connections = 100\n\
                 wal_sender_timeout = 0\n\
                 wal_level = replica\n\
                 zenith.file_cache_size = 4096\n\
                 zenith.file_cache_path = '/tmp/file.cache'\n\
                 listen_addresses = '{address}'\n\
                 port = {port}\n",
                address = self.address.ip(),
                port = self.address.port()
            ),
        )?;

        // Never clean up old WAL. TODO: We should use a replication
        // slot or something proper, to prevent the compute node
        // from removing WAL that hasn't been streamed to the safekeeper or
        // page server yet. (gh issue #349)
        conf.append("wal_keep_size", "10TB");
        self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;

        // Configure the node to fetch pages from pageserver
        let pageserver_connstr = {
            let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);

            // Set up authentication
            //
            // $ZENITH_AUTH_TOKEN will be replaced with value from environment
            // variable during compute pg startup. It is done this way because
            // otherwise user will be able to retrieve the value using SHOW
            // command or pg_settings
            let password = if let AuthType::ZenithJWT = auth_type {
                "$ZENITH_AUTH_TOKEN"
            } else {
                ""
            };

            format!("host={} port={} password={}", host, port, password)
        // set up authentication
        let password = if let AuthType::ZenithJWT = auth_type {
            "$ZENITH_AUTH_TOKEN"
        } else {
            ""
        };
        conf.append("shared_preload_libraries", "zenith");
        conf.append_line("");
        conf.append("zenith.page_server_connstring", &pageserver_connstr);
        conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
        conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
        conf.append_line("");

        // Configure that node to take pages from pageserver
        let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
        self.append_conf(
            "postgresql.conf",
            format!(
                concat!(
                    "shared_preload_libraries = zenith\n",
                    // $ZENITH_AUTH_TOKEN will be replaced with value from environment variable during compute pg startup
                    // it is done this way because otherwise user will be able to retrieve the value using SHOW command or pg_settings
                    "zenith.page_server_connstring = 'host={} port={} password={}'\n",
                    "zenith.zenith_timeline='{}'\n",
                    "zenith.zenith_tenant='{}'\n",
                ),
                host, port, password, self.timelineid, self.tenantid,
            )
            .as_str(),
        )?;

        // Configure the node to stream WAL directly to the pageserver
        conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
        conf.append("zenith.callmemaybe_connstring", &self.connstr());

        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
        file.write_all(conf.to_string().as_bytes())?;
        self.append_conf(
            "postgresql.conf",
            format!(
                concat!(
                    "synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
                    "zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
                ),
                self.connstr(),
            )
            .as_str(),
        )?;

        Ok(())
    }
@@ -351,6 +418,14 @@ impl PostgresNode {
        }
    }

    pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
        OpenOptions::new()
            .append(true)
            .open(self.pgdata().join(config).to_str().unwrap())?
            .write_all(opts.as_bytes())?;
        Ok(())
    }

    fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
        let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl");
        let mut cmd = Command::new(pg_ctl_path);
@@ -452,7 +527,9 @@ impl PostgresNode {
            .output()
            .expect("failed to execute whoami");

        assert!(output.status.success(), "whoami failed");
        if !output.status.success() {
            panic!("whoami failed");
        }

        String::from_utf8(output.stdout).unwrap().trim().to_string()
    }
@@ -12,7 +12,6 @@ use std::path::Path;

pub mod compute;
pub mod local_env;
pub mod postgresql_conf;
pub mod storage;

/// Read a PID file
@@ -1,212 +0,0 @@
///
/// Module for parsing postgresql.conf file.
///
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
/// enough to extract a few settings we need in Zenith, assuming you don't do
/// funny stuff like include-directives or funny escaping.
use anyhow::{anyhow, bail, Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
use std::io::BufRead;
use std::str::FromStr;

/// In-memory representation of a postgresql.conf file
#[derive(Default)]
pub struct PostgresConf {
    lines: Vec<String>,
    hash: HashMap<String, String>,
}

lazy_static! {
    static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
}

impl PostgresConf {
    pub fn new() -> PostgresConf {
        PostgresConf::default()
    }

    /// Read file into memory
    pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
        let mut result = Self::new();

        for line in std::io::BufReader::new(read).lines() {
            let line = line?;

            // Store each line in a vector, in original format
            result.lines.push(line.clone());

            // Also parse each line and insert key=value lines into a hash map.
            //
            // FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
            // But it's close enough for our usage.
            let line = line.trim();
            if line.starts_with('#') {
                // comment, ignore
                continue;
            } else if let Some(caps) = CONF_LINE_RE.captures(line) {
                let name = caps.get(1).unwrap().as_str();
                let raw_val = caps.get(2).unwrap().as_str();

                if let Ok(val) = deescape_str(raw_val) {
                    // Note: if there's already an entry in the hash map for
                    // this key, this will replace it. That's the behavior
                    // we want; when PostgreSQL reads the file, each line
                    // overrides any previous value for the same setting.
                    result.hash.insert(name.to_string(), val.to_string());
                }
            }
        }
        Ok(result)
    }

    /// Return the current value of 'option'
    pub fn get(&self, option: &str) -> Option<&str> {
        self.hash.get(option).map(|x| x.as_ref())
    }

    /// Return the current value of a field, parsed to the right datatype.
    ///
    /// This calls the FromStr::parse() function on the value of the field. If
    /// the field does not exist, or parsing fails, returns an error.
    ///
    pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
    where
        T: FromStr,
        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
    {
        self.get(field_name)
            .ok_or_else(|| anyhow!("could not find '{}' option {}", field_name, context))?
            .parse::<T>()
            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
    }

    ///
    /// Note: if you call this multiple times for the same option, the config
    /// file will contain a line for each call. It would be nice to have a
    /// function to change an existing line, but that's a TODO.
    ///
    pub fn append(&mut self, option: &str, value: &str) {
        self.lines
            .push(format!("{}={}\n", option, escape_str(value)));
        self.hash.insert(option.to_string(), value.to_string());
    }

    /// Append an arbitrary non-setting line to the config file
    pub fn append_line(&mut self, line: &str) {
        self.lines.push(line.to_string());
    }
}

impl fmt::Display for PostgresConf {
    /// Return the whole configuration file as a string
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for line in self.lines.iter() {
            f.write_str(line)?;
        }
        Ok(())
    }
}

/// Escape a value for putting in postgresql.conf.
fn escape_str(s: &str) -> String {
    // If the string doesn't contain anything that needs quoting or escaping, return it
    // as it is.
    //
    // The first part of the regex, before the '|', matches the INTEGER rule in the
    // PostgreSQL flex grammar (guc-file.l). It matches plain integers like "123" and
    // "-123", and also accepts units like "10MB". The second part of the regex matches
    // the UNQUOTED_STRING rule, and accepts strings that contain a single word, beginning
    // with a letter. That covers words like "off" or "posix". Everything else is quoted.
    //
    // This regex is a bit more conservative than the rules in guc-file.l, so we quote some
    // strings that PostgreSQL would accept without quoting, but that's OK.
    lazy_static! {
        static ref UNQUOTED_RE: Regex =
            Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
    }
    if UNQUOTED_RE.is_match(s) {
        s.to_string()
    } else {
        // Otherwise escape and quote it
        let s = s
            .replace('\\', "\\\\")
            .replace('\n', "\\n")
            .replace('\'', "''");

        "\'".to_owned() + &s + "\'"
    }
}

/// De-escape a possibly-quoted value.
///
/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
/// does this.
fn deescape_str(s: &str) -> Result<String> {
    // If the string has a quote at the beginning and end, strip them out.
    if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
        let mut result = String::new();

        let mut iter = s[1..(s.len() - 1)].chars().peekable();
        while let Some(c) = iter.next() {
            let newc = if c == '\\' {
                match iter.next() {
                    Some('b') => '\x08',
                    Some('f') => '\x0c',
                    Some('n') => '\n',
                    Some('r') => '\r',
                    Some('t') => '\t',
                    Some('0'..='7') => {
                        // TODO
                        bail!("octal escapes not supported");
                    }
                    Some(n) => n,
                    None => break,
                }
            } else if c == '\'' && iter.peek() == Some(&'\'') {
                // doubled quote becomes just one quote
                iter.next().unwrap()
            } else {
                c
            };

            result.push(newc);
        }
        Ok(result)
    } else {
        Ok(s.to_string())
    }
}

#[test]
fn test_postgresql_conf_escapes() -> Result<()> {
    assert_eq!(escape_str("foo bar"), "'foo bar'");
    // these don't need to be quoted
    assert_eq!(escape_str("foo"), "foo");
    assert_eq!(escape_str("123"), "123");
    assert_eq!(escape_str("+123"), "+123");
    assert_eq!(escape_str("-10"), "-10");
    assert_eq!(escape_str("1foo"), "1foo");
    assert_eq!(escape_str("foo1"), "foo1");
    assert_eq!(escape_str("10MB"), "10MB");
    assert_eq!(escape_str("-10kB"), "-10kB");

    // these need quoting and/or escaping
    assert_eq!(escape_str("foo bar"), "'foo bar'");
    assert_eq!(escape_str("fo'o"), "'fo''o'");
    assert_eq!(escape_str("fo\no"), "'fo\\no'");
    assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
    assert_eq!(escape_str("10 cats"), "'10 cats'");

    // Test de-escaping
    assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
    assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
    assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");

    // octal-escapes are currently not supported
    assert!(deescape_str("'foo\\7\\07\\007'").is_err());

    Ok(())
}
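The diff above shows this module being deleted in one branch; in the other branch, compute.rs drives it via `PostgresConf::read()` and `parse_field()`. A minimal usage sketch (editor-added, not part of the diff; the module path and the `node.conf` file name are assumptions for illustration):

```rust
// Hedged sketch of the PostgresConf API shown above. Assumes the module is
// reachable as control_plane::postgresql_conf::PostgresConf and that a
// hypothetical node.conf file exists.
use anyhow::{Context, Result};
use std::fs::File;

use control_plane::postgresql_conf::PostgresConf;

fn example() -> Result<()> {
    // Reading: parse_field() works for any FromStr type, mirroring compute.rs.
    let mut file = File::open("node.conf").context("failed to open node.conf")?;
    let conf = PostgresConf::read(&mut file)?;
    let port: u16 = conf.parse_field("port", "in node.conf")?;

    // Writing: append() quotes values only when needed (see escape_str above).
    let mut out = PostgresConf::new();
    out.append("shared_buffers", "1MB"); // matches the INTEGER rule: stays unquoted
    out.append("listen_addresses", "127.0.0.1"); // contains dots: becomes '127.0.0.1'
    println!("port was {}; new config:\n{}", port, out);
    Ok(())
}
```

Note how the two halves fit together: because `append()` escapes with `escape_str()` and `read()` de-escapes with `deescape_str()`, a value written by this module round-trips through the file unchanged, which is exactly what the test at the end of the module verifies.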
@@ -85,20 +85,20 @@ impl PageServerNode {
            ),
            env: env.clone(),
            http_client: Client::new(),
            http_base_url: format!("http://127.0.0.1:{}/v1", env.pageserver_http_port),
            http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
        }
    }

    fn pageserver_connection_config(password: &str, port: u16) -> Config {
        format!("postgresql://no_user:{}@127.0.0.1:{}/no_db", password, port)
        format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
            .parse()
            .unwrap()
    }

    pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
        let mut cmd = Command::new(self.env.pageserver_bin()?);
        let listen_pg = format!("127.0.0.1:{}", self.env.pageserver_pg_port);
        let listen_http = format!("127.0.0.1:{}", self.env.pageserver_http_port);
        let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
        let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
        let mut args = vec![
            "--init",
            "-D",
@@ -199,45 +199,23 @@ impl PageServerNode {
        bail!("pageserver failed to start in {} seconds", RETRIES);
    }

    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
    pub fn stop(&self) -> anyhow::Result<()> {
        let pid = read_pidfile(&self.pid_file())?;
        let pid = Pid::from_raw(pid);
        if immediate {
            println!("Stop pageserver immediately");
            if kill(pid, Signal::SIGQUIT).is_err() {
                bail!("Failed to kill pageserver with pid {}", pid);
            }
        } else {
            println!("Stop pageserver gracefully");
            if kill(pid, Signal::SIGTERM).is_err() {
                bail!("Failed to stop pageserver with pid {}", pid);
            }
        if kill(pid, Signal::SIGTERM).is_err() {
            bail!("Failed to kill pageserver with pid {}", pid);
        }

        // wait for pageserver stop
        let address = connection_address(&self.pg_connection_config);

        // TODO Remove this "timeout" and handle it on caller side instead.
        // Shutting down may take a long time,
        // if pageserver checkpoints a lot of data
        for _ in 0..100 {
            if let Err(_e) = TcpStream::connect(&address) {
                println!("Pageserver stopped receiving connections");

                // Now check status
                match self.check_status() {
                    Ok(_) => {
                        println!("Pageserver status is OK. Wait a bit.");
                        thread::sleep(Duration::from_secs(1));
                    }
                    Err(err) => {
                        println!("Pageserver status is: {}", err);
                        return Ok(());
                    }
                }
            } else {
                println!("Pageserver still receives connections");
                thread::sleep(Duration::from_secs(1));
        for _ in 0..5 {
            let stream = TcpStream::connect(&address);
            thread::sleep(Duration::from_secs(1));
            if let Err(_e) = stream {
                println!("Pageserver stopped");
                return Ok(());
            }
            println!("Stopping pageserver on {}", address);
        }

        bail!("Failed to stop pageserver with pid {}", pid);
@@ -335,9 +313,8 @@ impl PageServerNode {

impl Drop for PageServerNode {
    fn drop(&mut self) {
        // TODO Looks like this flag is never set
        if self.kill_on_exit {
            let _ = self.stop(true);
            let _ = self.stop();
        }
    }
}
@@ -7,7 +7,7 @@ if [ "$1" = 'pageserver' ]; then
        pageserver --init -D /data --postgres-distrib /usr/local
    fi
    echo "Starting pageserver at 0.0.0.0:6400"
    pageserver -l 0.0.0.0:6400 --listen-http 0.0.0.0:9898 -D /data
    pageserver -l 0.0.0.0:6400 -D /data
else
    "$@"
fi
@@ -10,5 +10,5 @@
- [pageserver/README](/pageserver/README) — pageserver overview.
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README) — WAL service overview.
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
@@ -4,7 +4,7 @@

Currently we build two main images:

- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).

And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id

### Safety

For now, a particular tenant can only appear on a particular pageserver. The set of safekeepers is also pinned to a particular (tenantid, timeline) pair, so there can only be one writer for a particular (tenantid, timeline).
For now, a particular tenant can only appear on a particular pageserver. The set of WAL acceptors is also pinned to a particular (tenantid, timeline) pair, so there can only be one writer for a particular (tenantid, timeline).
@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
log = "0.4.14"
clap = "2.33.0"
daemonize = "0.4.1"
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
tokio = { version = "1.11", features = ["process", "macros", "fs"] }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
@@ -34,14 +34,8 @@ toml = "0.5"
scopeguard = "1.1.0"
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
async-trait = "0.1"
const_format = "0.2.21"
tracing = "0.1.27"
signal-hook = { version = "0.3.10", features = ["extended-siginfo"] }

postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }

[dev-dependencies]
hex-literal = "0.3"
@@ -7,9 +7,8 @@ The Page Server has a few different duties:
- Replay WAL that's applicable to the chunks that the Page Server maintains
- Backup to S3

S3 is the main fault-tolerant storage of all data, as there are no Page Server
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
keeps track of WAL records which are not synced to S3 yet.
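To make that bookkeeping concrete, here is an editor-added sketch (not code from this repository) of the two LSN watermarks such a WAL service needs: everything below one watermark is already durable in S3, everything up to the other has been received, and only the range between them must be retained.

```rust
// Illustrative sketch only; this Lsn is a stand-in for zenith_utils::lsn::Lsn.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
struct Lsn(u64);

struct WalServiceState {
    /// WAL up to this point is durably stored in S3.
    synced_to_s3: Lsn,
    /// WAL up to this point has been received from compute nodes.
    received: Lsn,
}

impl WalServiceState {
    /// The range the WAL service must keep: received but not yet synced to S3.
    fn unsynced_range(&self) -> Option<(Lsn, Lsn)> {
        if self.received > self.synced_to_s3 {
            Some((self.synced_to_s3, self.received))
        } else {
            None
        }
    }
}

fn main() {
    let state = WalServiceState {
        synced_to_s3: Lsn(0x0169_8C48),
        received: Lsn(0x016B_9188),
    };
    // Prints the WAL interval that still has to be protected from deletion.
    println!("{:?}", state.unsynced_range());
}
```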

The Page Server consists of multiple threads that operate on a shared
repository of page versions:
@@ -31,7 +31,7 @@ use zenith_utils::lsn::Lsn;
pub struct Basebackup<'a> {
    ar: Builder<&'a mut dyn Write>,
    timeline: &'a Arc<dyn Timeline>,
    pub lsn: Lsn,
    lsn: Lsn,
    prev_record_lsn: Lsn,
}

@@ -97,6 +97,7 @@ impl<'a> Basebackup<'a> {
    pub fn send_tarball(&mut self) -> anyhow::Result<()> {
        // Create pgdata subdirs structure
        for dir in pg_constants::PGDATA_SUBDIRS.iter() {
            info!("send subdir {:?}", *dir);
            let header = new_tar_header_dir(*dir)?;
            self.ar.append(&header, &mut io::empty())?;
        }
@@ -2,44 +2,28 @@
// Main entry point for the Page Server executable
//

use log::*;
use pageserver::defaults::*;
use serde::{Deserialize, Serialize};
use std::{
    env,
    net::TcpListener,
    path::{Path, PathBuf},
    process::exit,
    str::FromStr,
    thread,
};
use tracing::*;
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};

use anyhow::{bail, ensure, Context, Result};
use signal_hook::consts::signal::*;
use signal_hook::consts::TERM_SIGNALS;
use signal_hook::flag;
use signal_hook::iterator::exfiltrator::WithOrigin;
use signal_hook::iterator::SignalsInfo;
use std::process::exit;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;

use anyhow::{bail, ensure, Result};
use clap::{App, Arg, ArgMatches};
use daemonize::Daemonize;

use pageserver::{
    branches,
    defaults::{
        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR,
        DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
    },
    http, page_service, relish_storage, tenant_mgr, PageServerConf, RelishStorageConfig,
    RelishStorageKind, S3Config, LOG_FILE_NAME,
    branches, http, page_service, tenant_mgr, PageServerConf, RelishStorageConfig, S3Config,
    LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend;

use const_format::formatcp;

/// String arguments that can be declared via CLI or config file
#[derive(Serialize, Deserialize)]
@@ -55,7 +39,6 @@ struct CfgFileParams {
    auth_type: Option<String>,
    // see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples
    relish_storage: Option<RelishStorage>,
    relish_storage_max_concurrent_sync: Option<String>,
}

#[derive(Serialize, Deserialize, Clone)]
@@ -106,7 +89,6 @@ impl CfgFileParams {
            auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
            auth_type: get_arg("auth-type"),
            relish_storage,
            relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
        }
    }

@@ -126,9 +108,6 @@ impl CfgFileParams {
                .or(other.auth_validation_public_key_path),
            auth_type: self.auth_type.or(other.auth_type),
            relish_storage: self.relish_storage.or(other.relish_storage),
            relish_storage_max_concurrent_sync: self
                .relish_storage_max_concurrent_sync
                .or(other.relish_storage_max_concurrent_sync),
        }
    }

@@ -197,34 +176,25 @@ impl CfgFileParams {
            );
        }

        let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() {
            Some(relish_storage_max_concurrent_sync) => {
                relish_storage_max_concurrent_sync.parse()?
            }
            None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
        };
        let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| {
            let storage = match storage_params.clone() {
                RelishStorage::Local { local_path } => {
                    RelishStorageKind::LocalFs(PathBuf::from(local_path))
                }
                RelishStorage::AwsS3 {
                    bucket_name,
                    bucket_region,
                    access_key_id,
                    secret_access_key,
                } => RelishStorageKind::AwsS3(S3Config {
                    bucket_name,
                    bucket_region,
                    access_key_id,
                    secret_access_key,
                }),
            };
            RelishStorageConfig {
                max_concurrent_sync,
                storage,
            }
        });
        let relish_storage_config =
            self.relish_storage
                .as_ref()
                .map(|storage_params| match storage_params.clone() {
                    RelishStorage::Local { local_path } => {
                        RelishStorageConfig::LocalFs(PathBuf::from(local_path))
                    }
                    RelishStorage::AwsS3 {
                        bucket_name,
                        bucket_region,
                        access_key_id,
                        secret_access_key,
                    } => RelishStorageConfig::AwsS3(S3Config {
                        bucket_name,
                        bucket_region,
                        access_key_id,
                        secret_access_key,
                    }),
                });

        Ok(PageServerConf {
            daemonize: false,
@@ -250,7 +220,6 @@ impl CfgFileParams {
    }

fn main() -> Result<()> {
    zenith_metrics::set_common_metrics_prefix("pageserver");
    let arg_matches = App::new("Zenith page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .arg(
@@ -259,14 +228,14 @@ fn main() -> Result<()> {
                .long("listen-pg")
                .alias("listen") // keep some compatibility
                .takes_value(true)
                .help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
                .help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"),
        )
        .arg(
            Arg::with_name("listen-http")
                .long("listen-http")
                .alias("http_endpoint") // keep some compatibility
                .takes_value(true)
                .help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
                .help("http endpoint address for metrics and management API calls ip:port (default: 127.0.0.1:5430)"),
        )
        .arg(
            Arg::with_name("daemonize")
@@ -374,19 +343,10 @@ fn main() -> Result<()> {
                .takes_value(true)
                .help("Credentials to access the AWS S3 bucket"),
        )
        .arg(
            Arg::with_name("relish-storage-max-concurrent-sync")
                .long("relish-storage-max-concurrent-sync")
                .takes_value(true)
                .help("Maximum allowed concurrent synchronisations with storage"),
        )
        .get_matches();

    let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
    let cfg_file_path = workdir
        .canonicalize()
        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?
        .join("pageserver.toml");
    let cfg_file_path = workdir.canonicalize()?.join("pageserver.toml");

    let args_params = CfgFileParams::from_args(&arg_matches);
||||
|
||||
@@ -398,37 +358,22 @@ fn main() -> Result<()> {
|
||||
args_params
|
||||
} else {
|
||||
// Supplement the CLI arguments with the config file
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)
|
||||
.with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?;
|
||||
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents).with_context(|| {
|
||||
format!(
|
||||
"Failed to read '{}' as pageserver config",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)?;
|
||||
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents)?;
|
||||
args_params.or(file_params)
|
||||
};
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
env::set_current_dir(&workdir).with_context(|| {
|
||||
format!(
|
||||
"Failed to set application's current dir to '{}'",
|
||||
workdir.display()
|
||||
)
|
||||
})?;
|
||||
env::set_current_dir(&workdir)?;
|
||||
|
||||
// Ensure the config is valid, even if just init-ing
|
||||
let mut conf = params.try_into_config().with_context(|| {
|
||||
format!(
|
||||
"Pageserver config at '{}' is not valid",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut conf = params.try_into_config()?;
|
||||
|
||||
conf.daemonize = arg_matches.is_present("daemonize");
|
||||
|
||||
if init && conf.daemonize {
|
||||
bail!("--daemonize cannot be used with --init")
|
||||
eprintln!("--daemonize cannot be used with --init");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// The configuration is all set up now. Turn it into a 'static
|
||||
@@ -438,37 +383,21 @@ fn main() -> Result<()> {
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if init {
|
||||
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
|
||||
branches::init_pageserver(conf, create_tenant)?;
|
||||
// write the config file
|
||||
let cfg_file_contents = toml::to_string_pretty(¶ms)
|
||||
.context("Failed to create pageserver config contents for initialisation")?;
|
||||
let cfg_file_contents = toml::to_string_pretty(¶ms)?;
|
||||
// TODO support enable-auth flag
|
||||
std::fs::write(&cfg_file_path, cfg_file_contents).with_context(|| {
|
||||
format!(
|
||||
"Failed to initialize pageserver config at '{}'",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
start_pageserver(conf).context("Failed to start pageserver")
|
||||
std::fs::write(&cfg_file_path, cfg_file_contents)?;
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
start_pageserver(conf)
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
|
||||
let term_now = Arc::new(AtomicBool::new(false));
|
||||
for sig in TERM_SIGNALS {
|
||||
// When terminated by a second term signal, exit with exit code 1.
|
||||
// This will do nothing the first time (because term_now is false).
|
||||
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
|
||||
// But this will "arm" the above for the second time, by setting it to true.
|
||||
// The order of registering these is important, if you put this one first, it will
|
||||
// first arm and then terminate ‒ all in the first round.
|
||||
flag::register(*sig, Arc::clone(&term_now))?;
|
||||
}
|
||||
let (_scope_guard, log_file) = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -501,20 +430,16 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
|
||||
match daemonize.start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => error!(%err, "could not daemonize"),
|
||||
Err(e) => error!("Error, {}", e),
|
||||
}
|
||||
}
|
||||
|
||||
// keep join handles for spawned threads
|
||||
// don't spawn threads before daemonizing
|
||||
let mut join_handles = Vec::new();
|
||||
|
||||
if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? {
|
||||
join_handles.push(handle);
|
||||
}
|
||||
// Initialize tenant manager.
|
||||
tenant_mgr::init(conf);
|
||||
|
||||
// keep join handles for spawned threads
|
||||
let mut join_handles = vec![];
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust | AuthType::MD5 => None,
|
||||
@@ -546,42 +471,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?;
|
||||
|
||||
for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
|
||||
match info.signal {
|
||||
SIGQUIT => {
|
||||
info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
|
||||
exit(111);
|
||||
}
|
||||
SIGTERM => {
|
||||
info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
|
||||
// Terminate postgres backends
|
||||
postgres_backend::set_pgbackend_shutdown_requested();
|
||||
// Stop all tenants and flush their data
|
||||
tenant_mgr::shutdown_all_tenants()?;
|
||||
// Wait for pageservice thread to complete the job
|
||||
page_service_thread
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
join_handles.push(page_service_thread);
|
||||
|
||||
// Shut down http router
|
||||
endpoint::shutdown();
|
||||
|
||||
// Wait for all threads
|
||||
for handle in join_handles.into_iter() {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
}
|
||||
info!("Pageserver shut down successfully completed");
|
||||
exit(0);
|
||||
}
|
||||
_ => {
|
||||
debug!("Unknown signal.");
|
||||
}
|
||||
}
|
||||
for handle in join_handles.into_iter() {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error")
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -14,12 +14,11 @@ use std::{
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use zenith_utils::crashsafe_dir;
|
||||
use log::*;
|
||||
use zenith_utils::logging;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::walredo::WalRedoManager;
|
||||
@@ -100,7 +99,7 @@ pub struct PointInTime {
|
||||
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
|
||||
// Initialize logger
|
||||
// use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
|
||||
let _log_file = logging::init(LOG_FILE_NAME, true)?;
|
||||
let (_scope_guard, _log_file) = logging::init(LOG_FILE_NAME, true)?;
|
||||
|
||||
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
|
||||
// process during repository initialization.
|
||||
@@ -119,7 +118,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
|
||||
println!("initializing tenantid {}", tenantid);
|
||||
create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
|
||||
}
|
||||
crashsafe_dir::create_dir_all(conf.tenants_path())?;
|
||||
fs::create_dir_all(conf.tenants_path())?;
|
||||
|
||||
println!("pageserver init succeeded");
|
||||
Ok(())
|
||||
@@ -136,12 +135,12 @@ pub fn create_repo(
|
||||
}
|
||||
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
crashsafe_dir::create_dir_all(&repo_dir)
|
||||
fs::create_dir_all(&repo_dir)
|
||||
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
|
||||
|
||||
crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
|
||||
fs::create_dir(conf.timelines_path(&tenantid))?;
|
||||
fs::create_dir_all(conf.branches_path(&tenantid))?;
|
||||
fs::create_dir_all(conf.tags_path(&tenantid))?;
|
||||
|
||||
info!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
@@ -151,13 +150,12 @@ pub fn create_repo(
|
||||
conf,
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
false,
|
||||
));
|
||||
|
||||
// Load data into pageserver
|
||||
// TODO To implement zenith import we need to
|
||||
// move data loading out of create_repo()
|
||||
bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?;
|
||||
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
@@ -176,16 +174,13 @@ fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
|
||||
// to get bootstrap data for timeline initialization.
|
||||
//
|
||||
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
info!("running initdb in {}... ", initdbpath.display());
|
||||
info!("running initdb... ");
|
||||
|
||||
let initdb_path = conf.pg_bin_dir().join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", initdbpath.to_str().unwrap()])
|
||||
.args(&["-U", &conf.superuser])
|
||||
.arg("--no-instructions")
|
||||
// This is only used for a temporary installation that is deleted shortly after,
|
||||
// so no need to fsync it
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
@@ -198,6 +193,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
String::from_utf8_lossy(&initdb_output.stderr)
|
||||
);
|
||||
}
|
||||
info!("initdb succeeded");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -212,8 +208,6 @@ fn bootstrap_timeline(
|
||||
tli: ZTimelineId,
|
||||
repo: &dyn Repository,
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
|
||||
|
||||
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
|
||||
|
||||
// Init temporarily repo to get bootstrap data
|
||||
@@ -222,14 +216,12 @@ fn bootstrap_timeline(
|
||||
|
||||
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
|
||||
info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn);
|
||||
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
let timeline = repo.create_empty_timeline(tli)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
timeline.writer().as_ref(),
|
||||
lsn,
|
||||
)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
|
||||
timeline.checkpoint()?;
|
||||
|
||||
println!(
|
||||
@@ -421,6 +413,7 @@ fn create_timeline(
|
||||
let timelinedir = conf.timeline_path(&timelineid, tenantid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Result;
|
||||
@@ -5,7 +6,6 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use routerify::{ext::RequestExt, RouterBuilder};
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
use zenith_utils::http::endpoint::auth_middleware;
|
||||
@@ -15,8 +15,6 @@ use zenith_utils::http::{
|
||||
endpoint,
|
||||
error::HttpErrorBody,
|
||||
json::{json_request, json_response},
|
||||
request::get_request_param,
|
||||
request::parse_request_param,
|
||||
};
|
||||
|
||||
use super::models::BranchCreateRequest;
|
||||
@@ -58,6 +56,33 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
    get_state(request).conf
}

fn get_request_param<'a>(
    request: &'a Request<Body>,
    param_name: &str,
) -> Result<&'a str, ApiError> {
    match request.param(param_name) {
        Some(arg) => Ok(arg),
        None => {
            return Err(ApiError::BadRequest(format!(
                "no {} specified in path param",
                param_name
            )))
        }
    }
}

fn parse_request_param<T: FromStr>(
    request: &Request<Body>,
    param_name: &str,
) -> Result<T, ApiError> {
    match get_request_param(request, param_name)?.parse() {
        Ok(v) => Ok(v),
        Err(_) => Err(ApiError::BadRequest(
            "failed to parse tenant id".to_string(),
        )),
    }
}

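A hypothetical handler wired through these helpers might look like the following; the handler name and response body are illustrative, only the helper calls mirror this diff:

// Illustrative handler: `tenant_id` is parsed via FromStr, `branch_name`
// is borrowed as a raw string, using the two helpers above.
async fn branch_example_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
    let branch_name: &str = get_request_param(&request, "branch_name")?;
    Ok(json_response(
        StatusCode::OK,
        format!("{}/{}", tenantid, branch_name),
    )?)
}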
// healthcheck handler
|
||||
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
Ok(Response::builder()
|
||||
@@ -73,7 +98,6 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, Some(request_data.tenant_id))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
|
||||
branches::create_branch(
|
||||
get_config(&request),
|
||||
&request_data.name,
|
||||
@@ -92,7 +116,6 @@ async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
check_permission(&request, Some(tenantid))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
|
||||
crate::branches::get_branches(get_config(&request), &tenantid)
|
||||
})
|
||||
.await
|
||||
@@ -103,12 +126,11 @@ async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
// TODO add to swagger
|
||||
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
|
||||
let branch_name: &str = get_request_param(&request, "branch_name")?;
|
||||
let conf = get_state(&request).conf;
|
||||
let path = conf.branch_path(&branch_name, &tenantid);
|
||||
let path = conf.branch_path(branch_name, &tenantid);
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
BranchInfo::from_path(path, conf, &tenantid, &repo)
|
||||
})
|
||||
@@ -122,13 +144,10 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
crate::branches::get_tenants(get_config(&request))
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let response_data =
|
||||
tokio::task::spawn_blocking(move || crate::branches::get_tenants(get_config(&request)))
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
@@ -139,7 +158,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
|
||||
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
|
||||
})
|
||||
.await
|
||||
|
||||
File diff suppressed because it is too large
@@ -25,13 +25,11 @@ OnDisk layers can be Image or Delta:
Dropped segments are always represented on disk by DeltaLayer.

LSN range defined by start_lsn and end_lsn:
- start_lsn is inclusive.
- end_lsn is exclusive.

For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen
in-memory layer or a delta layer, it is a valid end bound. An image
layer represents snapshot at one LSN, so end_lsn is always the
snapshot LSN + 1
- start_lsn is always inclusive.
- end_lsn depends on layer kind:
  - InMemoryLayer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
  - ImageLayer represents snapshot at one LSN, so end_lsn = lsn.
  - DeltaLayer has explicit end_lsn, which represents end of incremental layer.

Layers can be open or historical:
- Open layer is a writeable one. Only InMemory layer can be open.

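As a reading aid, the revised end_lsn convention can be summarized in a few lines; this enum is purely illustrative and is not the actual Layer trait:

#[derive(Clone, Copy)]
struct Lsn(u64); // stand-in for zenith_utils::lsn::Lsn

enum LayerKind {
    InMemory { drop_lsn: Option<Lsn> },
    Image { lsn: Lsn },
    Delta { end_lsn: Lsn },
}

fn end_lsn_of(layer: &LayerKind) -> Lsn {
    match *layer {
        // Open in-memory layer: unbounded until dropped.
        LayerKind::InMemory { drop_lsn: None } => Lsn(u64::MAX),
        // Dropped in-memory layer: ends at the drop LSN.
        LayerKind::InMemory { drop_lsn: Some(lsn) } => lsn,
        // Image layer: a snapshot at exactly one LSN.
        LayerKind::Image { lsn } => lsn,
        // Delta layer: explicit end bound of the incremental range.
        LayerKind::Delta { end_lsn } => end_lsn,
    }
}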
@@ -42,13 +42,15 @@ use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
|
||||
};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
use std::collections::BTreeMap;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
@@ -57,7 +59,7 @@ use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
@@ -107,6 +109,12 @@ impl From<&DeltaLayer> for Summary {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PageVersionMeta {
|
||||
page_image_range: Option<BlobRange>,
|
||||
record_range: Option<BlobRange>,
|
||||
}
|
||||
|
||||
///
|
||||
/// DeltaLayer is the in-memory data structure associated with an
|
||||
/// on-disk delta file. We keep a DeltaLayer in memory for each
|
||||
@@ -131,6 +139,9 @@ pub struct DeltaLayer {
|
||||
|
||||
dropped: bool,
|
||||
|
||||
/// Predecessor layer
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
|
||||
inner: Mutex<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
@@ -141,10 +152,10 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: VecMap<(u32, Lsn), BlobRange>,
|
||||
page_version_metas: BTreeMap<(u32, Lsn), PageVersionMeta>,
|
||||
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -169,7 +180,29 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
PathBuf::from(
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
}
|
||||
.to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
Some(Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
@@ -193,22 +226,20 @@ impl Layer for DeltaLayer {
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let iter = inner
|
||||
let mut iter = inner
|
||||
.page_version_metas
|
||||
.slice_range((Included(&minkey), Included(&maxkey)))
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, pv_lsn), blob_range) in iter {
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
if let Some(img) = pv.page_image {
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
|
||||
if let Some(img_range) = &entry.page_image_range {
|
||||
// Found a page image, return it
|
||||
let img = Bytes::from(read_blob(&page_version_reader, img_range)?);
|
||||
reconstruct_data.page_img = Some(img);
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec) = pv.record {
|
||||
} else if let Some(rec_range) = &entry.record_range {
|
||||
let rec = WALRecord::des(&read_blob(&page_version_reader, rec_range)?)?;
|
||||
let will_init = rec.will_init;
|
||||
reconstruct_data.records.push((*pv_lsn, rec));
|
||||
reconstruct_data.records.push(rec);
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
@@ -224,9 +255,16 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
// caller know about the predecessor layer.
|
||||
if need_image {
|
||||
Ok(PageReconstructResult::Continue(self.start_lsn))
|
||||
if let Some(cont_layer) = &self.predecessor {
|
||||
Ok(PageReconstructResult::Continue(
|
||||
self.start_lsn,
|
||||
Arc::clone(cont_layer),
|
||||
))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
@@ -235,22 +273,21 @@ impl Layer for DeltaLayer {
    /// Get size of the relation at given LSN
    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
        assert!(lsn >= self.start_lsn);
        ensure!(
            self.seg.rel.is_blocky(),
            "get_seg_size() called on a non-blocky rel"
        );

        // Scan the BTreeMap backwards, starting from the given entry.
        let inner = self.load()?;
        let slice = inner
            .relsizes
            .slice_range((Included(&Lsn(0)), Included(&lsn)));
        let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));

        if let Some((_entry_lsn, entry)) = slice.last() {
            Ok(*entry)
        let result;
        if let Some((_entry_lsn, entry)) = iter.next_back() {
            result = *entry;
        // Use the base image if needed
        } else if let Some(predecessor) = &self.predecessor {
            result = predecessor.get_seg_size(lsn)?;
        } else {
            Err(anyhow::anyhow!("could not find seg size in delta layer"))
            result = 0;
        }
        Ok(result)
    }

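The same backward-scan idiom recurs throughout this change: take a BTreeMap range up to the key, then step back once from its end. A self-contained sketch of the pattern:

use std::collections::BTreeMap;
use std::ops::Bound::Included;

// Latest value at or below `key`, if any: the replacement for the old
// VecMap::slice_range(..).last() pattern.
fn latest_at_or_below(map: &BTreeMap<u64, u32>, key: u64) -> Option<u32> {
    map.range((Included(&0), Included(&key)))
        .next_back()
        .map(|(_lsn, v)| *v)
}

// e.g. with relsizes {10 -> 1, 20 -> 5}: latest_at_or_below(&relsizes, 15) == Some(1)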
/// Does this segment exist at given LSN?
|
||||
@@ -270,15 +307,17 @@ impl Layer for DeltaLayer {
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.page_version_metas = VecMap::default();
|
||||
inner.relsizes = VecMap::default();
|
||||
inner.page_version_metas = BTreeMap::new();
|
||||
inner.relsizes = BTreeMap::new();
|
||||
inner.loaded = false;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
if let Some(path) = self.path() {
|
||||
fs::remove_file(path)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -295,22 +334,22 @@ impl Layer for DeltaLayer {
|
||||
|
||||
println!("--- relsizes ---");
|
||||
let inner = self.load()?;
|
||||
for (k, v) in inner.relsizes.as_slice() {
|
||||
for (k, v) in inner.relsizes.iter() {
|
||||
println!(" {}: {}", k, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
for (k, v) in inner.page_version_metas.iter() {
|
||||
let mut desc = String::new();
|
||||
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
|
||||
if let Some(img) = pv.page_image.as_ref() {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
if let Some(page_image_range) = v.page_image_range.as_ref() {
|
||||
let image = read_blob(&chapter, page_image_range)?;
|
||||
write!(&mut desc, " img {} bytes", image.len())?;
|
||||
}
|
||||
if let Some(rec) = pv.record.as_ref() {
|
||||
if let Some(record_range) = v.record_range.as_ref() {
|
||||
let record_bytes = read_blob(&chapter, record_range)?;
|
||||
let rec = WALRecord::des(&record_bytes)?;
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
@@ -320,7 +359,7 @@ impl Layer for DeltaLayer {
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
println!(" blk {} at {}: {}", blk, lsn, desc);
|
||||
println!(" blk {} at {}: {}", k.0, k.1, desc);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -342,15 +381,14 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new delta file, using the given page versions and relsizes.
|
||||
/// The page versions are passed by an iterator; the iterator must return
|
||||
/// page versions in blknum+lsn order.
|
||||
/// Create a new delta file, using the given btreemaps containing the page versions and
|
||||
/// relsizes.
|
||||
///
|
||||
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create<'a>(
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
@@ -358,13 +396,10 @@ impl DeltaLayer {
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
assert!(!relsizes.is_empty());
|
||||
}
|
||||
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -375,14 +410,17 @@ impl DeltaLayer {
|
||||
dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: true,
|
||||
page_version_metas: VecMap::default(),
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes,
|
||||
}),
|
||||
predecessor,
|
||||
};
|
||||
let mut inner = delta_layer.inner.lock().unwrap();
|
||||
|
||||
// Write the in-memory btreemaps into a file
|
||||
let path = delta_layer.path();
|
||||
let path = delta_layer
|
||||
.path()
|
||||
.expect("DeltaLayer is supposed to have a layer path on disk");
|
||||
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
@@ -392,27 +430,42 @@ impl DeltaLayer {
|
||||
|
||||
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
for (blknum, lsn, page_version) in page_versions {
|
||||
let buf = PageVersion::ser(page_version)?;
|
||||
let blob_range = page_version_writer.write_blob(&buf)?;
|
||||
for (key, page_version) in page_versions {
|
||||
let page_image_range = page_version
|
||||
.page_image
|
||||
.map(|page_image| page_version_writer.write_blob(page_image.as_ref()))
|
||||
.transpose()?;
|
||||
|
||||
inner
|
||||
.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
let record_range = page_version
|
||||
.record
|
||||
.map(|record| {
|
||||
let buf = WALRecord::ser(&record)?;
|
||||
page_version_writer.write_blob(&buf)
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
let old = inner.page_version_metas.insert(
|
||||
key,
|
||||
PageVersionMeta {
|
||||
page_image_range,
|
||||
record_range,
|
||||
},
|
||||
);
|
||||
|
||||
assert!(old.is_none());
|
||||
}
|
||||
|
||||
let book = page_version_writer.close()?;
|
||||
|
||||
// Write out page versions
|
||||
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
|
||||
let buf = VecMap::ser(&inner.page_version_metas)?;
|
||||
let buf = BTreeMap::ser(&inner.page_version_metas)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// and relsizes to separate chapter
|
||||
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
|
||||
let buf = VecMap::ser(&inner.relsizes)?;
|
||||
let buf = BTreeMap::ser(&inner.relsizes)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
@@ -431,8 +484,7 @@ impl DeltaLayer {
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
book.close()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
@@ -446,7 +498,12 @@ impl DeltaLayer {
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
&DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
);
|
||||
|
||||
let file = File::open(&path)?;
|
||||
@@ -494,10 +551,10 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
|
||||
let page_version_metas = VecMap::des(&chapter)?;
|
||||
let page_version_metas = BTreeMap::des(&chapter)?;
|
||||
|
||||
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
|
||||
let relsizes = VecMap::des(&chapter)?;
|
||||
let relsizes = BTreeMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
@@ -516,6 +573,7 @@ impl DeltaLayer {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
filename: &DeltaFileName,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
@@ -527,9 +585,10 @@ impl DeltaLayer {
|
||||
dropped: filename.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
}),
|
||||
predecessor,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -550,28 +609,10 @@ impl DeltaLayer {
|
||||
dropped: summary.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
}),
|
||||
predecessor: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
}
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,8 +13,6 @@ use anyhow::Result;
|
||||
use log::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::METADATA_FILE_NAME;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct DeltaFileName {
|
||||
@@ -37,7 +35,7 @@ impl DeltaFileName {
|
||||
/// Parse a string as a delta file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
pub fn from_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
@@ -170,7 +168,7 @@ impl ImageFileName {
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
pub fn from_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
@@ -288,11 +286,11 @@ pub fn list_files(
|
||||
let fname = direntry?.file_name();
|
||||
let fname = fname.to_str().unwrap();
|
||||
|
||||
if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
|
||||
if let Some(deltafilename) = DeltaFileName::from_str(fname) {
|
||||
deltafiles.push(deltafilename);
|
||||
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
||||
} else if let Some(imgfilename) = ImageFileName::from_str(fname) {
|
||||
imgfiles.push(imgfilename);
|
||||
} else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
|
||||
} else if fname == "wal" || fname == "metadata" || fname == "ancestor" {
|
||||
// ignore these
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
|
||||
@@ -114,7 +114,25 @@ pub struct ImageLayerInner {
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
PathBuf::from(
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
}
|
||||
.to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
Some(Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
@@ -134,8 +152,7 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
// End-bound is exclusive
|
||||
self.lsn + 1
|
||||
self.lsn
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
@@ -204,7 +221,9 @@ impl Layer for ImageLayer {
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
if let Some(path) = self.path() {
|
||||
fs::remove_file(path)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -280,7 +299,9 @@ impl ImageLayer {
|
||||
let inner = layer.inner.lock().unwrap();
|
||||
|
||||
// Write the images into a file
|
||||
let path = layer.path();
|
||||
let path = layer
|
||||
.path()
|
||||
.expect("ImageLayer is supposed to have a layer path on disk");
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let file = File::create(&path)?;
|
||||
@@ -315,10 +336,9 @@ impl ImageLayer {
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
book.close()?;
|
||||
|
||||
trace!("saved {}", path.display());
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
drop(inner);
|
||||
|
||||
@@ -423,7 +443,15 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = self.path();
|
||||
let path = Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
},
|
||||
);
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
@@ -470,21 +498,4 @@ impl ImageLayer {
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
}
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,17 +12,16 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::page_versions::PageVersions;
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
@@ -35,56 +34,57 @@ pub struct InMemoryLayer {
|
||||
///
|
||||
start_lsn: Lsn,
|
||||
|
||||
/// Frozen in-memory layers have an inclusive end LSN.
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// LSN of the oldest page version stored in this layer
|
||||
oldest_pending_lsn: Lsn,
|
||||
|
||||
/// The above fields never change. The parts that do change are in 'inner',
|
||||
/// and protected by mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
/// Predecessor layer might be needed?
|
||||
incremental: bool,
|
||||
inner: Mutex<InMemoryLayerInner>,
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// Frozen in-memory layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// If this relation was dropped, remember when that happened.
|
||||
/// The drop LSN is recorded in [`end_lsn`].
|
||||
dropped: bool,
|
||||
drop_lsn: Option<Lsn>,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
///
|
||||
page_versions: PageVersions,
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
|
||||
///
|
||||
/// `segsizes` tracks the size of the segment at different points in time.
|
||||
///
|
||||
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
|
||||
/// so that determining the size never depends on the predecessor layer. For
|
||||
/// a non-blocky rel, 'segsizes' is not used and is always empty.
|
||||
///
|
||||
segsizes: VecMap<Lsn, u32>,
|
||||
segsizes: BTreeMap<Lsn, u32>,
|
||||
|
||||
/// Writes are only allowed when true.
|
||||
/// Set to false when this layer is in the process of being replaced.
|
||||
writeable: bool,
|
||||
|
||||
/// Predecessor layer
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
fn assert_writeable(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
fn check_writeable(&self) -> WriteResult<()> {
|
||||
if self.writeable {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(NonWriteableError)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_seg_size(&self, lsn: Lsn) -> u32 {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let slice = self.segsizes.slice_range(..=lsn);
|
||||
let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
// We make sure there is always at least one entry
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
*entry
|
||||
} else {
|
||||
panic!("could not find seg size in in-memory layer");
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -93,26 +93,33 @@ impl Layer for InMemoryLayer {
|
||||
// An in-memory layer doesn't really have a filename as it's not stored on disk,
|
||||
// but we construct a filename as if it was a delta layer
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
let end_lsn;
|
||||
if let Some(drop_lsn) = inner.end_lsn {
|
||||
let dropped;
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
end_lsn = drop_lsn;
|
||||
dropped = true;
|
||||
} else {
|
||||
end_lsn = Lsn(u64::MAX);
|
||||
dropped = false;
|
||||
}
|
||||
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped: inner.dropped,
|
||||
dropped,
|
||||
}
|
||||
.to_string();
|
||||
|
||||
PathBuf::from(format!("inmem-{}", delta_filename))
|
||||
}
|
||||
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
@@ -126,18 +133,22 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
let inner = self.inner.read().unwrap();
|
||||
if let Some(end_lsn) = self.end_lsn {
|
||||
return Lsn(end_lsn.0 + 1);
|
||||
}
|
||||
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
end_lsn
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
drop_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.dropped
|
||||
let inner = self.inner.lock().unwrap();
|
||||
inner.drop_lsn.is_some()
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
@@ -151,22 +162,24 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
{
|
||||
let inner = self.inner.read().unwrap();
|
||||
let predecessor: Option<Arc<dyn Layer>>;
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let iter = inner
|
||||
{
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
// Scan the BTreeMap backwards, starting from reconstruct_data.lsn.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let mut iter = inner
|
||||
.page_versions
|
||||
.get_block_lsn_range(blknum, ..=lsn)
|
||||
.iter()
|
||||
.rev();
|
||||
for (entry_lsn, entry) in iter {
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
|
||||
if let Some(img) = &entry.page_image {
|
||||
reconstruct_data.page_img = Some(img.clone());
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec) = &entry.record {
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
reconstruct_data.records.push(rec.clone());
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
@@ -177,14 +190,16 @@ impl Layer for InMemoryLayer {
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
|
||||
predecessor = inner.predecessor.clone();
|
||||
// release lock on 'inner'
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know
|
||||
// caller know about the predecessor layer.
|
||||
if need_image {
|
||||
if self.incremental {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
if let Some(cont_layer) = predecessor {
|
||||
Ok(PageReconstructResult::Continue(self.start_lsn, cont_layer))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
@@ -196,18 +211,14 @@ impl Layer for InMemoryLayer {
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.lock().unwrap();
|
||||
Ok(inner.get_seg_size(lsn))
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
// If the segment created after requested LSN,
|
||||
// it doesn't exist in the layer. But we shouldn't
|
||||
@@ -215,8 +226,8 @@ impl Layer for InMemoryLayer {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
if lsn >= drop_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
@@ -239,35 +250,36 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.incremental
|
||||
let inner = self.inner.lock().unwrap();
|
||||
inner.predecessor.is_some()
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
.end_lsn
|
||||
.drop_lsn
|
||||
.as_ref()
|
||||
.map(Lsn::to_string)
|
||||
.map(|drop_lsn| drop_lsn.to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
|
||||
"----- in-memory layer for tli {} seg {} {}-{} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str
|
||||
);
|
||||
|
||||
for (k, v) in inner.segsizes.as_slice() {
|
||||
for (k, v) in inner.segsizes.iter() {
|
||||
println!("segsizes {}: {}", k, v);
|
||||
}
|
||||
|
||||
for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
for (k, v) in inner.page_versions.iter() {
|
||||
println!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
blknum,
|
||||
lsn,
|
||||
pv.page_image.is_some(),
|
||||
pv.record.is_some()
|
||||
k.0,
|
||||
k.1,
|
||||
v.page_image.is_some(),
|
||||
v.record.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -275,19 +287,26 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// A result of an inmemory layer data being written to disk.
pub struct LayersOnDisk {
    pub delta_layers: Vec<DeltaLayer>,
    pub image_layers: Vec<ImageLayer>,
}
/// Write failed because the layer is in process of being replaced.
/// See [`LayeredTimeline::perform_write_op`] for how to handle this error.
#[derive(Debug)]
pub struct NonWriteableError;

impl LayersOnDisk {
    pub fn is_empty(&self) -> bool {
        self.delta_layers.is_empty() && self.image_layers.is_empty()
    }
pub type WriteResult<T> = std::result::Result<T, NonWriteableError>;

/// Helper struct to cleanup `InMemoryLayer::freeze` return signature.
pub struct FreezeLayers {
    /// Replacement layer for the layer which freeze was called on.
    pub frozen: Arc<InMemoryLayer>,
    /// New open layer containing leftover data.
    pub open: Option<Arc<InMemoryLayer>>,
}

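How callers are meant to react to NonWriteableError is only hinted at here (the doc comment points at perform_write_op). A toy model of the retry idea, with every name beyond NonWriteableError invented for illustration:

use std::sync::{Arc, Mutex};

#[derive(Debug)]
struct NonWriteableError;

struct ToyLayer {
    writeable: Mutex<bool>,
}

impl ToyLayer {
    fn put(&self, _blknum: u32) -> Result<(), NonWriteableError> {
        if *self.writeable.lock().unwrap() {
            Ok(())
        } else {
            Err(NonWriteableError)
        }
    }
}

// Retry until the write lands on a layer that is still writeable. Each
// iteration re-fetches the current open layer, which freeze() is expected
// to have replaced with a fresh writeable one in the meantime.
fn perform_write_op(get_open_layer: impl Fn() -> Arc<ToyLayer>, blknum: u32) {
    while get_open_layer().put(blknum).is_err() {}
}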
impl InMemoryLayer {
|
||||
fn assert_not_frozen(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
}
|
||||
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
pub fn get_oldest_pending_lsn(&self) -> Lsn {
|
||||
self.oldest_pending_lsn
|
||||
@@ -311,25 +330,20 @@ impl InMemoryLayer {
|
||||
start_lsn
|
||||
);
|
||||
|
||||
// The segment is initially empty, so initialize 'segsizes' with 0.
|
||||
let mut segsizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
segsizes.append(start_lsn, 0).unwrap();
|
||||
}
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn: None,
|
||||
oldest_pending_lsn,
|
||||
incremental: false,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
segsizes: BTreeMap::new(),
|
||||
writeable: true,
|
||||
predecessor: None,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -337,10 +351,10 @@ impl InMemoryLayer {
|
||||
// Write operations
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> u32 {
|
||||
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> WriteResult<u32> {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
rec.lsn,
|
||||
PageVersion {
|
||||
page_image: None,
|
||||
record: Some(rec),
|
||||
@@ -349,7 +363,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 {
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> WriteResult<u32> {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
@@ -362,7 +376,8 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 {
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> WriteResult<u32> {
|
||||
self.assert_not_frozen();
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
trace!(
|
||||
@@ -372,11 +387,11 @@ impl InMemoryLayer {
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
|
||||
inner.assert_writeable();
|
||||
inner.check_writeable()?;
|
||||
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
|
||||
let old = inner.page_versions.insert((blknum, lsn), pv);
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -390,7 +405,7 @@ impl InMemoryLayer {
|
||||
if self.seg.rel.is_blocky() {
|
||||
let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1;
|
||||
|
||||
// use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
|
||||
// use inner get_seg_size, since calling self.get_seg_size will try to acquire self.inner.lock
|
||||
// which we've just acquired above
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
if newsize > oldsize {
|
||||
@@ -421,9 +436,7 @@ impl InMemoryLayer {
|
||||
gapblknum,
|
||||
blknum
|
||||
);
|
||||
let old = inner
|
||||
.page_versions
|
||||
.append_or_update_last(gapblknum, lsn, zeropv);
|
||||
let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
|
||||
if old.is_some() {
|
||||
@@ -434,47 +447,48 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
|
||||
return newsize - oldsize;
|
||||
inner.segsizes.insert(lsn, newsize);
|
||||
return Ok(newsize - oldsize);
|
||||
}
|
||||
}
|
||||
|
||||
0
|
||||
Ok(0)
|
||||
}

/// Remember that the relation was truncated at given LSN
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) {
assert!(
self.seg.rel.is_blocky(),
"put_truncation() called on a non-blocky rel"
);
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> {
self.assert_not_frozen();

let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
let mut inner = self.inner.lock().unwrap();
inner.check_writeable()?;

// check that we truncate to a smaller size than the segment was before the truncation
let oldsize = inner.get_seg_size(lsn);
assert!(segsize < oldsize);

let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
let old = inner.segsizes.insert(lsn, segsize);

if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Inserting truncation, but had an entry for the LSN already");
}

Ok(())
}

/// Remember that the segment was dropped at given LSN
pub fn drop_segment(&self, lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> {
self.assert_not_frozen();

assert!(inner.end_lsn.is_none());
assert!(!inner.dropped);
inner.dropped = true;
assert!(self.start_lsn < lsn);
inner.end_lsn = Some(lsn);
let mut inner = self.inner.lock().unwrap();

trace!("dropped segment {} at {}", self.seg, lsn);
inner.check_writeable()?;

assert!(inner.drop_lsn.is_none());
inner.drop_lsn = Some(lsn);

info!("dropped segment {} at {}", self.seg, lsn);

Ok(())
}

///
@@ -491,9 +505,6 @@ impl InMemoryLayer {
) -> Result<InMemoryLayer> {
let seg = src.get_seg_tag();

assert!(oldest_pending_lsn.is_aligned());
assert!(oldest_pending_lsn >= start_lsn);

trace!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
seg,
@@ -501,11 +512,11 @@ impl InMemoryLayer {
start_lsn,
);

// Copy the segment size at the start LSN from the predecessor layer.
let mut segsizes = VecMap::default();
// For convenience, copy the segment size from the predecessor layer
let mut segsizes = BTreeMap::new();
if seg.rel.is_blocky() {
let size = src.get_seg_size(start_lsn)?;
segsizes.append(start_lsn, size).unwrap();
segsizes.insert(start_lsn, size);
}

Ok(InMemoryLayer {
@@ -514,43 +525,117 @@ impl InMemoryLayer {
tenantid,
seg,
start_lsn,
end_lsn: None,
oldest_pending_lsn,
incremental: true,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
page_versions: PageVersions::default(),
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: BTreeMap::new(),
segsizes,
writeable: true,
predecessor: Some(src),
}),
})
}

pub fn is_writeable(&self) -> bool {
let inner = self.inner.read().unwrap();
inner.end_lsn.is_none()
}
/// Splits `self` into two InMemoryLayers: `frozen` and `open`.
/// All data up to and including `cutoff_lsn` (or the drop LSN, if dropped)
/// is copied to `frozen`, while the remaining data is copied to `open`.
/// After completion, self is non-writeable, but not frozen.
pub fn freeze(&self, cutoff_lsn: Lsn) -> Result<FreezeLayers> {
info!(
"freezing in memory layer for {} on timeline {} at {}",
self.seg, self.timelineid, cutoff_lsn
);

/// Make the layer non-writeable. Only call once.
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is inclusive
pub fn freeze(&self, end_lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
self.assert_not_frozen();

if inner.end_lsn.is_some() {
assert!(inner.dropped);
let mut inner = self.inner.lock().unwrap();
assert!(inner.writeable);
inner.writeable = false;

// Normally, use the cutoff LSN as the end of the frozen layer.
// But if the relation was dropped, we know that there are no
// more changes coming in for it, and in particular we know that
// there are no changes "in flight" for the LSN anymore, so we use
// the drop LSN instead. The drop-LSN could be ahead of the
// caller-specified LSN!
let dropped = inner.drop_lsn.is_some();
let end_lsn = if dropped {
inner.drop_lsn.unwrap()
} else {
assert!(!inner.dropped);
assert!(self.start_lsn < end_lsn + 1);
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
cutoff_lsn
};

if let Some((lsn, _)) = inner.segsizes.as_slice().last() {
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
// Divide all the page versions into old and new at the 'end_lsn' cutoff point.
let mut before_page_versions;
let mut before_segsizes;
let mut after_page_versions;
let mut after_segsizes;
if !dropped {
before_segsizes = BTreeMap::new();
after_segsizes = BTreeMap::new();
for (lsn, size) in inner.segsizes.iter() {
if *lsn > end_lsn {
after_segsizes.insert(*lsn, *size);
} else {
before_segsizes.insert(*lsn, *size);
}
}

for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
assert!(lsn <= end_lsn);
before_page_versions = BTreeMap::new();
after_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn > end_lsn {
after_page_versions.insert((*blknum, *lsn), pv.clone());
} else {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
}
} else {
before_page_versions = inner.page_versions.clone();
before_segsizes = inner.segsizes.clone();
after_segsizes = BTreeMap::new();
after_page_versions = BTreeMap::new();
}

let frozen = Arc::new(InMemoryLayer {
conf: self.conf,
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: Some(end_lsn),
oldest_pending_lsn: self.start_lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: inner.drop_lsn,
page_versions: before_page_versions,
segsizes: before_segsizes,
writeable: false,
predecessor: inner.predecessor.clone(),
}),
});

let open = if !dropped && (!after_segsizes.is_empty() || !after_page_versions.is_empty()) {
let mut new_open = Self::create_successor_layer(
self.conf,
frozen.clone(),
self.timelineid,
self.tenantid,
end_lsn,
end_lsn,
)?;

let new_inner = new_open.inner.get_mut().unwrap();
new_inner.page_versions.append(&mut after_page_versions);
new_inner.segsizes.append(&mut after_segsizes);

Some(Arc::new(new_open))
} else {
None
};

// TODO could we avoid creating the `frozen` if it contains no data
Ok(FreezeLayers { frozen, open })
}
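// [Editor's note, not part of the diff.] freeze() above partitions both maps at
// 'end_lsn' by iterating and cloning. For 'segsizes', which is keyed by the LSN
// alone, BTreeMap::split_off could split in place; for page versions keyed by
// (blknum, lsn) the cutoff predicate does not follow the key order, so the copy
// loop is needed. A minimal sketch of the in-place split, with u64 standing in
// for Lsn:
fn split_at_cutoff_sketch() {
    use std::collections::BTreeMap;
    let mut segsizes: BTreeMap<u64, u32> = BTreeMap::new();
    segsizes.insert(100, 10);
    segsizes.insert(200, 20);
    segsizes.insert(300, 30);
    // Keep keys <= 200 in 'segsizes', move keys > 200 into 'after'.
    let after = segsizes.split_off(&201);
    assert_eq!(segsizes.len(), 2);
    assert_eq!(after.len(), 1);
}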

/// Write this frozen in-memory layer to disk.
@@ -561,62 +646,40 @@ impl InMemoryLayer {
/// WAL records between start and end LSN. (The delta layer is not needed
/// when a new relish is created with a single LSN, so that the start and
/// end LSN are the same.)
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
trace!(
"write_to_disk {} get_end_lsn is {}",
self.filename().display(),
self.get_end_lsn()
);
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<Vec<Arc<dyn Layer>>> {
let end_lsn = self.end_lsn.expect("can only write frozen layers to disk");

// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
// though: another thread might have grabbed a reference to this layer
// in `get_layer_for_write' just before the checkpointer called
// `freeze`, and then `write_to_disk` on it. When the thread gets the
// lock, it will see that it's not writeable anymore and retry, but it
// would have to wait until we release it. That race condition is very
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().unwrap();
let end_lsn_exclusive = inner.end_lsn.unwrap();
let inner = self.inner.lock().unwrap();

if inner.dropped {
let delta_layer = DeltaLayer::create(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
self.start_lsn,
end_lsn_exclusive,
true,
inner.page_versions.ordered_page_version_iter(None),
inner.segsizes.clone(),
)?;
trace!(
"freeze: created delta layer for dropped segment {} {}-{}",
self.seg,
self.start_lsn,
end_lsn_exclusive
);
return Ok(LayersOnDisk {
delta_layers: vec![delta_layer],
image_layers: Vec::new(),
});
let drop_lsn = inner.drop_lsn;
let predecessor = inner.predecessor.clone();

let mut before_page_versions;
let mut before_segsizes;
if inner.drop_lsn.is_none() {
before_segsizes = BTreeMap::new();
for (lsn, size) in inner.segsizes.iter() {
if *lsn <= end_lsn {
before_segsizes.insert(*lsn, *size);
}
}

before_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn < end_lsn {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
}
} else {
before_page_versions = inner.page_versions.clone();
before_segsizes = inner.segsizes.clone();
}

// Since `end_lsn` is inclusive, subtract 1.
// We want to make an ImageLayer for the last included LSN,
// so the DeltaLayer should exclude that LSN.
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
drop(inner);

let mut page_versions = inner
.page_versions
.ordered_page_version_iter(Some(end_lsn_inclusive));
let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();

let mut delta_layers = Vec::new();

if self.start_lsn != end_lsn_inclusive {
let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive);
if self.start_lsn != end_lsn {
// Write the page versions before the cutoff to disk.
let delta_layer = DeltaLayer::create(
self.conf,
@@ -624,36 +687,35 @@ impl InMemoryLayer {
self.tenantid,
self.seg,
self.start_lsn,
end_lsn_inclusive,
false,
page_versions,
segsizes,
end_lsn,
drop_lsn.is_some(),
predecessor,
before_page_versions,
before_segsizes,
)?;
delta_layers.push(delta_layer);
frozen_layers.push(Arc::new(delta_layer));
trace!(
"freeze: created delta layer {} {}-{}",
self.seg,
self.start_lsn,
end_lsn_inclusive
end_lsn
);
} else {
assert!(page_versions.next().is_none());
assert!(before_page_versions.is_empty());
}

drop(inner);
if drop_lsn.is_none() {
// Write a new base image layer at the cutoff point
let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
frozen_layers.push(Arc::new(image_layer));
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
}

// Write a new base image layer at the cutoff point
let image_layer =
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
trace!(
"freeze: created image layer {} at {}",
self.seg,
end_lsn_inclusive
);
Ok(frozen_layers)
}

Ok(LayersOnDisk {
delta_layers,
image_layers: vec![image_layer],
})
pub fn update_predecessor(&self, predecessor: Arc<dyn Layer>) -> Option<Arc<dyn Layer>> {
let mut inner = self.inner.lock().unwrap();
inner.predecessor.replace(predecessor)
}
}

@@ -1,468 +0,0 @@
///
/// IntervalTree is a data structure for holding intervals. It is generic
/// to make unit testing possible, but the only real user of it is the layer map.
///
/// It's inspired by the "segment tree" or a "statistic tree" as described in
/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold
/// the points instead of a binary tree. This is called an "interval tree" instead
/// of "segment tree" because the term "segment" is already used in Zenith to mean
/// something else. To add to the confusion, there is another data structure known
/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree),
/// for storing intervals, but this isn't that.
///
/// The basic idea is to have a B-tree of "interesting Points". At each Point,
/// there is a list of intervals that contain the point. The Points are formed
/// from the start bounds of each interval; there is a Point for each distinct
/// start bound.
///
/// Operations:
///
/// To find intervals that contain a given point, you search the b-tree to find
/// the nearest Point <= search key. Then you just return the list of intervals.
///
/// To insert an interval, find the Point with start key equal to the inserted item.
/// If the Point doesn't exist yet, create it, by copying all the items from the
/// previous Point that cover the new Point. Then walk right, inserting the new
/// interval to all the Points that are contained by the new interval (including the
/// newly created Point).
///
/// To remove an interval, you scan the tree for all the Points that are contained by
/// the removed interval, and remove it from the list in each Point.
///
/// Requirements and assumptions:
///
/// - Can store overlapping items
/// - But there are not many overlapping items
/// - The interval bounds don't change after it is added to the tree
/// - Intervals are uniquely identified by pointer equality. You must not insert the
/// same interval object twice, and `remove` uses pointer equality to remove the right
/// interval. It is OK to have two intervals with the same bounds, however.
///
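// [Editor's worked example, not part of the original comment.] With intervals
// A = [10, 14) and B = [12, 16), the tree holds one Point per distinct start
// bound:
//
//   Point 10: [A]      (only A contains 10)
//   Point 12: [A, B]   (both contain 12; A is copied in when Point 12 is created)
//
// A search for key 13 finds the nearest Point <= 13, which is Point 12, and
// returns the element with the highest end bound, B. Removing A deletes it from
// both lists and drops Point 10 entirely, since its element list becomes empty.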
use std::collections::BTreeMap;
use std::fmt::Debug;
use std::ops::Range;
use std::sync::Arc;

pub struct IntervalTree<I: ?Sized>
where
I: IntervalItem,
{
points: BTreeMap<I::Key, Point<I>>,
}

struct Point<I: ?Sized> {
/// All intervals that contain this point, in no particular order.
///
/// We assume that there aren't a lot of overlapping intervals, so that this vector
/// never grows very large. If that assumption doesn't hold, we could keep this ordered
/// by the end bound, to speed up `search`. But as long as there are only a few elements,
/// a linear search is OK.
elements: Vec<Arc<I>>,
}

/// Abstraction for an interval that can be stored in the tree
///
/// The start bound is inclusive and the end bound is exclusive. End must be greater
/// than start.
pub trait IntervalItem {
type Key: Ord + Copy + Debug + Sized;

fn start_key(&self) -> Self::Key;
fn end_key(&self) -> Self::Key;

fn bounds(&self) -> Range<Self::Key> {
self.start_key()..self.end_key()
}
}

impl<I: ?Sized> IntervalTree<I>
where
I: IntervalItem,
{
/// Return an element that contains 'key', or precedes it.
///
/// If there are multiple candidates, returns the one with the highest 'end' key.
pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
// Find the greatest point that precedes or is equal to the search key. If there is
// none, returns None.
let (_, p) = self.points.range(..=key).next_back()?;

// Find the element with the highest end key at this point
let highest_item = p
.elements
.iter()
.reduce(|a, b| {
// starting with Rust 1.53, could use `std::cmp::max_by_key` here
if a.end_key() > b.end_key() {
a
} else {
b
}
})
.unwrap();
Some(Arc::clone(highest_item))
}

/// Iterate over all items with start bound >= 'key'
pub fn iter_newer(&self, key: I::Key) -> IntervalIter<I> {
IntervalIter {
point_iter: self.points.range(key..),
elem_iter: None,
}
}

/// Iterate over all items
pub fn iter(&self) -> IntervalIter<I> {
IntervalIter {
point_iter: self.points.range(..),
elem_iter: None,
}
}

pub fn insert(&mut self, item: Arc<I>) {
let start_key = item.start_key();
let end_key = item.end_key();
assert!(start_key < end_key);
let bounds = start_key..end_key;

// Find the starting point and walk forward from there
let mut found_start_point = false;
let iter = self.points.range_mut(bounds);
for (point_key, point) in iter {
if *point_key == start_key {
found_start_point = true;
// It is an error to insert the same item to the tree twice.
assert!(
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
"interval is already in the tree"
);
}
point.elements.push(Arc::clone(&item));
}
if !found_start_point {
// Create a new Point for the starting point

// Look at the previous point, and copy over elements that overlap with this
// new point
let mut new_elements: Vec<Arc<I>> = Vec::new();
if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
let overlapping_prev_elements = prev_point
.elements
.iter()
.filter(|x| x.bounds().contains(&start_key))
.cloned();

new_elements.extend(overlapping_prev_elements);
}
new_elements.push(item);

let new_point = Point {
elements: new_elements,
};
self.points.insert(start_key, new_point);
}
}

pub fn remove(&mut self, item: &Arc<I>) {
// range search points
let start_key = item.start_key();
let end_key = item.end_key();
let bounds = start_key..end_key;

let mut points_to_remove: Vec<I::Key> = Vec::new();
let mut found_start_point = false;
for (point_key, point) in self.points.range_mut(bounds) {
if *point_key == start_key {
found_start_point = true;
}
let len_before = point.elements.len();
point.elements.retain(|other| !Arc::ptr_eq(other, item));
let len_after = point.elements.len();
assert_eq!(len_after + 1, len_before);
if len_after == 0 {
points_to_remove.push(*point_key);
}
}
assert!(found_start_point);

for k in points_to_remove {
self.points.remove(&k).unwrap();
}
}
}

pub struct IntervalIter<'a, I: ?Sized>
where
I: IntervalItem,
{
point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
}

impl<'a, I> Iterator for IntervalIter<'a, I>
where
I: IntervalItem + ?Sized,
{
type Item = Arc<I>;

fn next(&mut self) -> Option<Self::Item> {
// Iterate over all elements in all the points in 'point_iter'. To avoid
// returning the same element twice, we only return each element at its
// starting point.
loop {
// Return next remaining element from the current point
if let Some((point_key, elem_iter)) = &mut self.elem_iter {
for elem in elem_iter {
if elem.start_key() == *point_key {
return Some(Arc::clone(elem));
}
}
}
// No more elements at this point. Move to next point.
if let Some((point_key, point)) = self.point_iter.next() {
self.elem_iter = Some((*point_key, point.elements.iter()));
continue;
} else {
// No more points, all done
return None;
}
}
}
}

impl<I: ?Sized> Default for IntervalTree<I>
where
I: IntervalItem,
{
fn default() -> Self {
IntervalTree {
points: BTreeMap::new(),
}
}
}

#[cfg(test)]
mod tests {
use super::*;
use std::fmt;

#[derive(Debug)]
struct MockItem {
start_key: u32,
end_key: u32,
val: String,
}
impl IntervalItem for MockItem {
type Key = u32;

fn start_key(&self) -> u32 {
self.start_key
}
fn end_key(&self) -> u32 {
self.end_key
}
}
impl MockItem {
fn new(start_key: u32, end_key: u32) -> Self {
MockItem {
start_key,
end_key,
val: format!("{}-{}", start_key, end_key),
}
}
fn new_str(start_key: u32, end_key: u32, val: &str) -> Self {
MockItem {
start_key,
end_key,
val: format!("{}-{}: {}", start_key, end_key, val),
}
}
}
impl fmt::Display for MockItem {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.val)
}
}
#[rustfmt::skip]
fn assert_search(
tree: &IntervalTree<MockItem>,
key: u32,
expected: &[&str],
) -> Option<Arc<MockItem>> {
if let Some(v) = tree.search(key) {
let vstr = v.to_string();

assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
assert!(
expected.contains(&vstr.as_str()),
"search with {} returned {}, expected one of: {:?}",
key, v, expected,
);

Some(v)
} else {
assert!(
expected.is_empty(),
"search with {} returned None, expected one of {:?}",
key, expected
);
None
}
}

fn assert_contents(tree: &IntervalTree<MockItem>, expected: &[&str]) {
let mut contents: Vec<String> = tree.iter().map(|e| e.to_string()).collect();
contents.sort();
assert_eq!(contents, expected);
}

fn dump_tree(tree: &IntervalTree<MockItem>) {
for (point_key, point) in tree.points.iter() {
print!("{}:", point_key);
for e in point.elements.iter() {
print!(" {}", e);
}
println!();
}
}

#[test]
fn test_interval_tree_simple() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();

// Simple, non-overlapping ranges.
tree.insert(Arc::new(MockItem::new(10, 11)));
tree.insert(Arc::new(MockItem::new(11, 12)));
tree.insert(Arc::new(MockItem::new(12, 13)));
tree.insert(Arc::new(MockItem::new(18, 19)));
tree.insert(Arc::new(MockItem::new(17, 18)));
tree.insert(Arc::new(MockItem::new(15, 16)));

assert_search(&tree, 9, &[]);
assert_search(&tree, 10, &["10-11"]);
assert_search(&tree, 11, &["11-12"]);
assert_search(&tree, 12, &["12-13"]);
assert_search(&tree, 13, &["12-13"]);
assert_search(&tree, 14, &["12-13"]);
assert_search(&tree, 15, &["15-16"]);
assert_search(&tree, 16, &["15-16"]);
assert_search(&tree, 17, &["17-18"]);
assert_search(&tree, 18, &["18-19"]);
assert_search(&tree, 19, &["18-19"]);
assert_search(&tree, 20, &["18-19"]);

// remove a few entries and search around them again
tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry
tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle
tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry
assert_search(&tree, 9, &[]);
assert_search(&tree, 10, &[]);
assert_search(&tree, 11, &["11-12"]);
assert_search(&tree, 12, &["11-12"]);
assert_search(&tree, 14, &["11-12"]);
assert_search(&tree, 15, &["15-16"]);
assert_search(&tree, 17, &["17-18"]);
assert_search(&tree, 18, &["17-18"]);
}

#[test]
fn test_interval_tree_overlap() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();

// Overlapping items
tree.insert(Arc::new(MockItem::new(22, 24)));
tree.insert(Arc::new(MockItem::new(23, 25)));
let x24_26 = Arc::new(MockItem::new(24, 26));
tree.insert(Arc::clone(&x24_26));
let x26_28 = Arc::new(MockItem::new(26, 28));
tree.insert(Arc::clone(&x26_28));
tree.insert(Arc::new(MockItem::new(25, 27)));

assert_search(&tree, 22, &["22-24"]);
assert_search(&tree, 23, &["22-24", "23-25"]);
assert_search(&tree, 24, &["23-25", "24-26"]);
assert_search(&tree, 25, &["24-26", "25-27"]);
assert_search(&tree, 26, &["25-27", "26-28"]);
assert_search(&tree, 27, &["26-28"]);
assert_search(&tree, 28, &["26-28"]);
assert_search(&tree, 29, &["26-28"]);

tree.remove(&x24_26);
tree.remove(&x26_28);
assert_search(&tree, 23, &["22-24", "23-25"]);
assert_search(&tree, 24, &["23-25"]);
assert_search(&tree, 25, &["25-27"]);
assert_search(&tree, 26, &["25-27"]);
assert_search(&tree, 27, &["25-27"]);
assert_search(&tree, 28, &["25-27"]);
assert_search(&tree, 29, &["25-27"]);
}

#[test]
fn test_interval_tree_nested() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();

// Items containing other items
tree.insert(Arc::new(MockItem::new(31, 39)));
tree.insert(Arc::new(MockItem::new(32, 34)));
tree.insert(Arc::new(MockItem::new(33, 35)));
tree.insert(Arc::new(MockItem::new(30, 40)));

assert_search(&tree, 30, &["30-40"]);
assert_search(&tree, 31, &["30-40", "31-39"]);
assert_search(&tree, 32, &["30-40", "32-34", "31-39"]);
assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]);
assert_search(&tree, 34, &["30-40", "33-35", "31-39"]);
assert_search(&tree, 35, &["30-40", "31-39"]);
assert_search(&tree, 36, &["30-40", "31-39"]);
assert_search(&tree, 37, &["30-40", "31-39"]);
assert_search(&tree, 38, &["30-40", "31-39"]);
assert_search(&tree, 39, &["30-40"]);
assert_search(&tree, 40, &["30-40"]);
assert_search(&tree, 41, &["30-40"]);
}

#[test]
fn test_interval_tree_duplicates() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();

// Duplicate keys
let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
tree.insert(Arc::clone(&item_a));
let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
tree.insert(Arc::clone(&item_b));
let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
tree.insert(Arc::clone(&item_c));
let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
tree.insert(Arc::clone(&item_d));
let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
tree.insert(Arc::clone(&item_e));

dump_tree(&tree);

assert_search(
&tree,
55,
&["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"],
);
tree.remove(&item_b);
dump_tree(&tree);

assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]);

tree.remove(&item_d);
dump_tree(&tree);
assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]);
}

#[test]
#[should_panic]
fn test_interval_tree_insert_twice() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();

// Inserting the same item twice is not cool
let item = Arc::new(MockItem::new(1, 2));
tree.insert(Arc::clone(&item));
tree.insert(Arc::clone(&item)); // fails assertion
}
}
@@ -9,14 +9,13 @@
//! new image and delta layers and corresponding files are written to disk.
//!

use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
use crate::layered_repository::InMemoryLayer;
use crate::relish::*;
use anyhow::Result;
use lazy_static::lazy_static;
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::collections::{BTreeMap, BinaryHeap, HashMap};
use std::sync::Arc;
use zenith_metrics::{register_int_gauge, IntGauge};
use zenith_utils::lsn::Lsn;
@@ -77,14 +76,7 @@ impl LayerMap {
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();

segentry.update_open(Arc::clone(&layer));

let oldest_pending_lsn = layer.get_oldest_pending_lsn();

// After a crash and restart, 'oldest_pending_lsn' of the oldest in-memory
// layer becomes the WAL streaming starting point, so it had better not point
// in the middle of a WAL record.
assert!(oldest_pending_lsn.is_aligned());
segentry.insert_open(Arc::clone(&layer));

// Also add it to the binary heap
let open_layer_entry = OpenLayerEntry {
@@ -105,14 +97,11 @@ impl LayerMap {

// Also remove it from the SegEntry of this segment
let mut segentry = self.segs.get_mut(&segtag).unwrap();
if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) {
segentry.open = None;
} else {
// We could have already updated segentry.open for
// dropped (non-writeable) layer. This is fine.
assert!(!oldest_entry.layer.is_writeable());
assert!(oldest_entry.layer.is_dropped());
}
assert!(Arc::ptr_eq(
segentry.open.as_ref().unwrap(),
&oldest_entry.layer
));
segentry.open = None;

NUM_INMEMORY_LAYERS.dec();
}
@@ -132,11 +121,12 @@ impl LayerMap {
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
pub fn remove_historic(&mut self, layer: &dyn Layer) {
let tag = layer.get_seg_tag();
let start_lsn = layer.get_start_lsn();

if let Some(segentry) = self.segs.get_mut(&tag) {
segentry.historic.remove(&layer);
segentry.historic.remove(&start_lsn);
}
NUM_ONDISK_LAYERS.dec();
}
@@ -154,7 +144,7 @@ impl LayerMap {
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
{
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
if let Some(exists) = segentry.exists_at_lsn(lsn) {
rels.insert(seg.rel, exists);
}
}
@@ -162,7 +152,7 @@ impl LayerMap {
}
_ => {
if tag == None {
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
if let Some(exists) = segentry.exists_at_lsn(lsn) {
rels.insert(seg.rel, exists);
}
}
@@ -184,20 +174,6 @@ impl LayerMap {
}
}

/// Is there any layer for given segment that is alive at the lsn?
///
/// This is a public wrapper for a SegEntry function,
/// used for garbage collection, to determine if some alive layer
/// exists at the lsn. If so, we shouldn't delete a newer dropped layer
/// to avoid incorrectly making it visible.
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
Ok(if let Some(segentry) = self.segs.get(&seg) {
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
} else {
false
})
}

/// Return the oldest in-memory layer, along with its generation number.
pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
self.open_layers
@@ -215,7 +191,7 @@ impl LayerMap {

pub fn iter_historic_layers(&self) -> HistoricLayerIter {
HistoricLayerIter {
seg_iter: self.segs.iter(),
segiter: self.segs.iter(),
iter: None,
}
}
@@ -229,7 +205,7 @@ impl LayerMap {
open.dump()?;
}

for layer in segentry.historic.iter() {
for (_, layer) in segentry.historic.iter() {
layer.dump()?;
}
}
@@ -238,40 +214,34 @@ impl LayerMap {
}
}

impl IntervalItem for dyn Layer {
type Key = Lsn;

fn start_key(&self) -> Lsn {
self.get_start_lsn()
}
fn end_key(&self) -> Lsn {
self.get_end_lsn()
}
}

///
/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
/// associated with the segment.
///
/// The last layer that is open for writes is always an InMemoryLayer,
/// and is kept in a separate field, because there can be only one for
/// each segment. The older layers, stored on disk, are kept in an
/// IntervalTree.
/// each segment. The older layers, stored on disk, are kept in a
/// BTreeMap keyed by the layer's start LSN.
#[derive(Default)]
struct SegEntry {
open: Option<Arc<InMemoryLayer>>,
historic: IntervalTree<dyn Layer>,
pub open: Option<Arc<InMemoryLayer>>,
pub historic: BTreeMap<Lsn, Arc<dyn Layer>>,
}

impl SegEntry {
/// Does the segment exist at given LSN?
/// Return None if object is not found in this SegEntry.
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
if let Some(layer) = self.get(lsn) {
Ok(Some(layer.get_seg_exists(lsn)?))
} else {
Ok(None)
fn exists_at_lsn(&self, lsn: Lsn) -> Option<bool> {
if let Some(layer) = &self.open {
if layer.get_start_lsn() <= lsn && lsn <= layer.get_end_lsn() {
let exists = layer.get_seg_exists(lsn).ok()?;
return Some(exists);
}
} else if let Some((_, layer)) = self.historic.range(..=lsn).next_back() {
let exists = layer.get_seg_exists(lsn).ok()?;
return Some(exists);
}
None
}

pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
@@ -282,30 +252,40 @@ impl SegEntry {
}
}

self.historic.search(lsn)
if let Some((_start_lsn, layer)) = self.historic.range(..=lsn).next_back() {
Some(Arc::clone(layer))
} else {
None
}
}
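// [Editor's sketch, not part of the diff.] The lookup above replaces
// IntervalTree::search with the standard BTreeMap idiom for "latest entry at or
// before a key": range(..=key).next_back(). Hypothetical, self-contained example
// with u64 standing in for Lsn:
fn latest_at_or_before_sketch() {
    use std::collections::BTreeMap;
    let mut historic: BTreeMap<u64, &str> = BTreeMap::new();
    historic.insert(100, "layer A"); // starts at LSN 100
    historic.insert(300, "layer B"); // starts at LSN 300
    // The layer whose start LSN is closest at or below 250 is layer A.
    let hit = historic.range(..=250).next_back().map(|(_, layer)| *layer);
    assert_eq!(hit, Some("layer A"));
}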

pub fn newer_image_layer_exists(&self, lsn: Lsn) -> bool {
// We only check on-disk layers, because
// in-memory layers are not durable

self.historic
.iter_newer(lsn)
.any(|layer| !layer.is_incremental())
for (_newer_lsn, layer) in self.historic.range(lsn..) {
// Ignore incremental layers.
if layer.is_incremental() {
continue;
}
if layer.get_end_lsn() > lsn {
return true;
} else {
continue;
}
}
false
}

// Set new open layer for a SegEntry.
// It's ok to rewrite the previous open layer,
// but only if it is not writeable anymore.
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) {
if let Some(prev_open) = &self.open {
assert!(!prev_open.is_writeable());
}
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
assert!(self.open.is_none());
self.open = Some(layer);
}

pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
self.historic.insert(layer);
let start_lsn = layer.get_start_lsn();

self.historic.insert(start_lsn, layer);
}
}

@@ -344,8 +324,8 @@ impl Eq for OpenLayerEntry {}

/// Iterator returned by LayerMap::iter_historic_layers()
pub struct HistoricLayerIter<'a> {
seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
iter: Option<IntervalIter<'a, dyn Layer>>,
segiter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
iter: Option<std::collections::btree_map::Iter<'a, Lsn, Arc<dyn Layer>>>,
}

impl<'a> Iterator for HistoricLayerIter<'a> {
@@ -355,11 +335,11 @@ impl<'a> Iterator for HistoricLayerIter<'a> {
loop {
if let Some(x) = &mut self.iter {
if let Some(x) = x.next() {
return Some(Arc::clone(&x));
return Some(Arc::clone(&*x.1));
}
}
if let Some((_tag, segentry)) = self.seg_iter.next() {
self.iter = Some(segentry.historic.iter());
if let Some(seg) = self.segiter.next() {
self.iter = Some(seg.1.historic.iter());
continue;
} else {
return None;
@@ -414,14 +394,14 @@ mod tests {
let mut layers = LayerMap::default();

let gen1 = layers.increment_generation();
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(100), Lsn(100)));
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(100), Lsn(200)));
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(100), Lsn(120)));
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(100), Lsn(110)));

let gen2 = layers.increment_generation();
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(100), Lsn(110)));
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(100), Lsn(100)));

// A helper function (closure) to pop the next oldest open entry from the layer map,
// and assert that it is what we'd expect
@@ -432,12 +412,12 @@ mod tests {
layers.pop_oldest_open();
};

assert_pop_layer(0, gen1); // 0x100
assert_pop_layer(5, gen2); // 0x100
assert_pop_layer(3, gen1); // 0x110
assert_pop_layer(4, gen2); // 0x110
assert_pop_layer(2, gen1); // 0x120
assert_pop_layer(1, gen1); // 0x200
assert_pop_layer(0, gen1); // 100
assert_pop_layer(5, gen2); // 100
assert_pop_layer(3, gen1); // 110
assert_pop_layer(4, gen2); // 110
assert_pop_layer(2, gen1); // 120
assert_pop_layer(1, gen1); // 200

Ok(())
}

@@ -1,150 +0,0 @@
use std::{collections::HashMap, ops::RangeBounds, slice};

use zenith_utils::{lsn::Lsn, vec_map::VecMap};

use super::storage_layer::PageVersion;

const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[];

#[derive(Debug, Default)]
pub struct PageVersions(HashMap<u32, VecMap<Lsn, PageVersion>>);

impl PageVersions {
pub fn append_or_update_last(
&mut self,
blknum: u32,
lsn: Lsn,
page_version: PageVersion,
) -> Option<PageVersion> {
let map = self.0.entry(blknum).or_insert_with(VecMap::default);
map.append_or_update_last(lsn, page_version).unwrap()
}

/// Get all [`PageVersion`]s in a block
pub fn get_block_slice(&self, blknum: u32) -> &[(Lsn, PageVersion)] {
self.0
.get(&blknum)
.map(VecMap::as_slice)
.unwrap_or(EMPTY_SLICE)
}

/// Get a range of [`PageVersions`] in a block
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(
&self,
blknum: u32,
range: R,
) -> &[(Lsn, PageVersion)] {
self.0
.get(&blknum)
.map(|vec_map| vec_map.slice_range(range))
.unwrap_or(EMPTY_SLICE)
}

/// Iterate through [`PageVersion`]s in (block, lsn) order.
/// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
let mut ordered_blocks: Vec<u32> = self.0.keys().cloned().collect();
ordered_blocks.sort_unstable();

let slice = ordered_blocks
.first()
.map(|&blknum| self.get_block_slice(blknum))
.unwrap_or(EMPTY_SLICE);

OrderedPageVersionIter {
page_versions: self,
ordered_blocks,
cur_block_idx: 0,
cutoff_lsn,
cur_slice_iter: slice.iter(),
}
}
}

pub struct OrderedPageVersionIter<'a> {
page_versions: &'a PageVersions,

ordered_blocks: Vec<u32>,
cur_block_idx: usize,

cutoff_lsn: Option<Lsn>,

cur_slice_iter: slice::Iter<'a, (Lsn, PageVersion)>,
}

impl OrderedPageVersionIter<'_> {
fn is_lsn_before_cutoff(&self, lsn: &Lsn) -> bool {
if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
lsn < cutoff_lsn
} else {
true
}
}
}

impl<'a> Iterator for OrderedPageVersionIter<'a> {
type Item = (u32, Lsn, &'a PageVersion);

fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some((lsn, page_version)) = self.cur_slice_iter.next() {
if self.is_lsn_before_cutoff(lsn) {
let blknum = self.ordered_blocks[self.cur_block_idx];
return Some((blknum, *lsn, page_version));
}
}

let next_block_idx = self.cur_block_idx + 1;
let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
self.cur_block_idx = next_block_idx;
self.cur_slice_iter = self.page_versions.get_block_slice(blknum).iter();
}
}
}

#[cfg(test)]
mod tests {
use super::*;

const EMPTY_PAGE_VERSION: PageVersion = PageVersion {
page_image: None,
record: None,
};

#[test]
fn test_ordered_iter() {
let mut page_versions = PageVersions::default();
const BLOCKS: u32 = 1000;
const LSNS: u64 = 50;

for blknum in 0..BLOCKS {
for lsn in 0..LSNS {
let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION);
assert!(old.is_none());
}
}

let mut iter = page_versions.ordered_page_version_iter(None);
for blknum in 0..BLOCKS {
for lsn in 0..LSNS {
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
assert_eq!(actual_blknum, blknum);
assert_eq!(Lsn(lsn), actual_lsn);
}
}
assert!(iter.next().is_none());
assert!(iter.next().is_none()); // should be robust against excessive next() calls

const CUTOFF_LSN: Lsn = Lsn(30);
let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
for blknum in 0..BLOCKS {
for lsn in 0..CUTOFF_LSN.0 {
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
assert_eq!(actual_blknum, blknum);
assert_eq!(Lsn(lsn), actual_lsn);
}
}
assert!(iter.next().is_none());
assert!(iter.next().is_none()); // should be robust against excessive next() calls
}
}
@@ -10,6 +10,7 @@ use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::path::PathBuf;
use std::sync::Arc;

use zenith_utils::lsn::Lsn;

@@ -78,7 +79,7 @@ pub struct PageVersion {
/// 'records' contains the records to apply over the base image.
///
pub struct PageReconstructData {
pub records: Vec<(Lsn, WALRecord)>,
pub records: Vec<WALRecord>,
pub page_img: Option<Bytes>,
}

@@ -86,9 +87,9 @@ pub struct PageReconstructData {
pub enum PageReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue(Lsn),
/// This layer didn't contain all the required data, the caller should collect
/// more data from the returned predecessor layer at the returned LSN.
Continue(Lsn, Arc<dyn Layer>),
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
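// [Editor's sketch, not part of the diff.] With the new Continue(Lsn, Arc<dyn Layer>)
// variant, the caller no longer needs to look the predecessor up in the layer map;
// it simply follows the returned layer. The shape of a caller loop (variant and
// method names beyond those shown above are assumptions):
//
//     loop {
//         match layer.get_page_reconstruct_data(blknum, lsn, &mut reconstruct_data)? {
//             PageReconstructResult::Complete => break,
//             PageReconstructResult::Continue(cont_lsn, predecessor) => {
//                 // Collect more records from the predecessor at the earlier LSN.
//                 lsn = cont_lsn;
//                 layer = predecessor;
//             }
//             PageReconstructResult::Missing(missing_lsn) => {
//                 anyhow::bail!("base image not found at {}", missing_lsn)
//             }
//         }
//     }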
@@ -110,19 +111,24 @@ pub trait Layer: Send + Sync {
/// Identify the relish segment
fn get_seg_tag(&self) -> SegmentTag;

/// Inclusive start bound of the LSN range that this layer holds
fn get_start_lsn(&self) -> Lsn;

/// Exclusive end bound of the LSN range that this layer holds.
/// The meaning of 'end_lsn' depends on the layer kind:
/// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
/// - image layer represents a snapshot at one LSN, so end_lsn = lsn
/// - delta layer has end_lsn
///
/// - For an open in-memory layer, this is MAX_LSN.
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
/// - An image layer represents a snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
/// TODO Is end_lsn always exclusive for all layer kinds?
fn get_end_lsn(&self) -> Lsn;

/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;

/// Gets the physical location of the layer on disk.
/// Some layers, such as in-memory, might not have a location.
fn path(&self) -> Option<PathBuf>;

/// Filename used to store this layer on disk. (Even in-memory layers
/// implement this, to print a handy unique identifier for the layer for
/// log messages, even though they're never on disk.)
@@ -140,8 +146,8 @@ pub trait Layer: Send + Sync {
///
/// See PageReconstructResult for possible return values. The collected data
/// is appended to reconstruct_data; the caller should pass an empty struct
/// on first call. If this returns PageReconstructResult::Continue, look up
/// the predecessor layer and call again with the same 'reconstruct_data'
/// on first call. If this returns PageReconstructResult::Continue, call
/// again on the returned predecessor layer with the same 'reconstruct_data'
/// to collect more data.
fn get_page_reconstruct_data(
&self,

@@ -13,7 +13,7 @@ pub mod http;
pub mod layered_repository;
pub mod page_service;
pub mod relish;
pub mod relish_storage;
mod relish_storage;
pub mod repository;
pub mod restore_local_repo;
pub mod tenant_mgr;
@@ -22,13 +22,12 @@ pub mod walreceiver;
pub mod walredo;

pub mod defaults {
use const_format::formatcp;
use std::time::Duration;

pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_PG_LISTEN_ADDR: &str = "127.0.0.1:64000"; // can't format! const yet...
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = "127.0.0.1:9898";

// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
// would be more appropriate. But a low value forces the code to be exercised more,
@@ -40,7 +39,6 @@ pub mod defaults {
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);

pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
}
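// [Editor's note, not part of the diff.] The new defaults above derive the listen
// address strings from the port constants with const_format::formatcp!, which
// expands to a &'static str at compile time; plain format! cannot be used in a
// const context. A minimal sketch:
fn formatcp_sketch() {
    use const_format::formatcp;
    const PORT: u16 = 64000;
    const ADDR: &str = formatcp!("127.0.0.1:{PORT}");
    assert_eq!(ADDR, "127.0.0.1:64000");
}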

lazy_static! {
@@ -126,6 +124,10 @@ impl PageServerConf {
self.timeline_path(timelineid, tenantid).join("ancestor")
}

fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
self.timeline_path(timelineid, tenantid).join("wal")
}

//
// Postgres distribution paths
//
@@ -151,8 +153,8 @@ impl PageServerConf {
checkpoint_period: Duration::from_secs(10),
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
listen_pg_addr: "127.0.0.1:5430".to_string(),
listen_http_addr: "127.0.0.1:9898".to_string(),
superuser: "zenith_admin".to_string(),
workdir: repo_dir,
pg_distrib_dir: "".into(),
@@ -165,37 +167,18 @@ impl PageServerConf {

/// External relish storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone)]
pub struct RelishStorageConfig {
/// Limits the number of concurrent sync operations between pageserver and relish storage.
pub max_concurrent_sync: usize,
/// The storage connection configuration.
pub storage: RelishStorageKind,
}

/// A kind of a relish storage to connect to, with its connection configuration.
#[derive(Debug, Clone)]
pub enum RelishStorageKind {
/// Storage based on local file system.
/// Specify a root folder to place all stored relish data into.
pub enum RelishStorageConfig {
/// Root folder to place all stored relish data into.
LocalFs(PathBuf),
/// AWS S3 based storage, storing all relishes into the root
/// of the S3 bucket from the config.
AwsS3(S3Config),
}

/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
#[derive(Clone)]
pub struct S3Config {
/// Name of the bucket to connect to.
pub bucket_name: String,
/// The region where the bucket is located.
pub bucket_region: String,
/// "Login" to use when connecting to the bucket.
/// Can be empty for cases like AWS k8s IAM
/// where we can allow certain pods to connect
/// to the bucket directly without any credentials.
pub access_key_id: Option<String>,
/// "Password" to use when connecting to the bucket.
pub secret_access_key: Option<String>,
}


@@ -13,6 +13,7 @@
use anyhow::{anyhow, bail, ensure, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use regex::Regex;
use std::net::TcpListener;
use std::str;
@@ -20,12 +21,10 @@ use std::str::FromStr;
use std::sync::Arc;
use std::thread;
use std::{io, net::TcpStream};
use tracing::*;
use zenith_metrics::{register_histogram_vec, HistogramVec};
use zenith_utils::auth::{self, JwtAuth};
use zenith_utils::auth::{Claims, Scope};
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::is_socket_read_timed_out;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::postgres_backend::{self, AuthType};
use zenith_utils::pq_proto::{
@@ -188,32 +187,17 @@ pub fn thread_main(
listener: TcpListener,
auth_type: AuthType,
) -> anyhow::Result<()> {
let mut join_handles = Vec::new();

while !tenant_mgr::shutdown_requested() {
loop {
let (socket, peer_addr) = listener.accept()?;
debug!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
let local_auth = auth.clone();

let handle = thread::Builder::new()
.name("serving Page Service thread".into())
.spawn(move || {
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
error!(%err, "page server thread exited with error");
}
})
.unwrap();

join_handles.push(handle);
thread::spawn(move || {
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
error!("error: {}", err);
}
});
}

debug!("page_service loop terminated. wait for connections to cancel");
for handle in join_handles.into_iter() {
handle.join().unwrap();
}

Ok(())
}
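// [Editor's note, not part of the diff.] The reworked thread_main above keeps one
// JoinHandle per connection thread and joins them all once shutdown is requested,
// so the accept loop can drain cleanly instead of leaking detached threads. The
// bare shape of the pattern (names are placeholders):
//
//     let mut join_handles = Vec::new();
//     while !shutdown_requested() {
//         let (socket, _peer) = listener.accept()?;
//         join_handles.push(std::thread::spawn(move || handle_connection(socket)));
//     }
//     for handle in join_handles {
//         handle.join().unwrap();
//     }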
|
||||
|
||||
fn page_service_conn_main(
|
||||
@@ -232,7 +216,7 @@ fn page_service_conn_main(
|
||||
}
|
||||
|
||||
let mut conn_handler = PageServerHandler::new(conf, auth);
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
@@ -276,66 +260,48 @@ impl PageServerHandler {
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> anyhow::Result<()> {
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();

// Check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;

/* switch client to COPYBOTH */
pgb.write_message(&BeMessage::CopyBothResponse)?;

while !tenant_mgr::shutdown_requested() {
match pgb.read_message() {
Ok(message) => {
if let Some(message) = message {
trace!("query: {:?}", message);
while let Some(message) = pgb.read_message()? {
trace!("query({:?}): {:?}", timelineid, message);

let copy_data_bytes = match message {
FeMessage::CopyData(bytes) => bytes,
_ => continue,
};

let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;

let response = match zenith_fe_msg {
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_exists"])
.observe_closure_duration(|| {
self.handle_get_rel_exists_request(&*timeline, &req)
}),
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_size"])
.observe_closure_duration(|| {
self.handle_get_nblocks_request(&*timeline, &req)
}),
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
.with_label_values(&["get_page_at_lsn"])
.observe_closure_duration(|| {
self.handle_get_page_at_lsn_request(&*timeline, &req)
}),
};
let response = match zenith_fe_msg {
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_exists"])
.observe_closure_duration(|| {
self.handle_get_rel_exists_request(&*timeline, &req)
}),
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_size"])
.observe_closure_duration(|| self.handle_get_nblocks_request(&*timeline, &req)),
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
.with_label_values(&["get_page_at_lsn"])
.observe_closure_duration(|| {
self.handle_get_page_at_lsn_request(&*timeline, &req)
}),
};

let response = response.unwrap_or_else(|e| {
// print all the details to the log with {:#}, but for the client the
// error message is enough
error!("error reading relation or page version: {:#}", e);
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});
let response = response.unwrap_or_else(|e| {
error!("error reading relation or page version: {}", e);
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});

pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
} else {
break;
}
}
Err(e) => {
if !is_socket_read_timed_out(&e) {
return Err(e);
}
}
}
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
}

Ok(())
}

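The SMGR_QUERY_TIME pattern above — pick the histogram for a query kind via a label, then time the handler closure — follows the prometheus crate's HistogramVec API. A minimal, self-contained sketch of the same pattern; the metric name and label values here are illustrative, not the page server's actual registration:

use lazy_static::lazy_static;
use prometheus::{register_histogram_vec, HistogramVec};

lazy_static! {
    // Hypothetical stand-in for SMGR_QUERY_TIME: one histogram per query kind.
    static ref QUERY_TIME: HistogramVec = register_histogram_vec!(
        "smgr_query_seconds",
        "Time spent serving smgr queries",
        &["smgr_query_type"]
    )
    .unwrap();
}

fn lookup_page() -> u32 {
    QUERY_TIME
        .with_label_values(&["get_page_at_lsn"])
        // observe_closure_duration times the closure, records the duration
        // in the histogram, and passes the closure's return value through.
        .observe_closure_duration(|| {
            // ... the actual page reconstruction would happen here ...
            42
        })
}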
@@ -395,8 +361,6 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();

let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;

@@ -412,7 +376,6 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;

@@ -432,8 +395,6 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
.entered();
let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;

@@ -451,20 +412,17 @@ impl PageServerHandler {
lsn: Option<Lsn>,
tenantid: ZTenantId,
) -> anyhow::Result<()> {
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
let _enter = span.enter();

// check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;

// switch client to COPYOUT
/* switch client to COPYOUT */
pgb.write_message(&BeMessage::CopyOutResponse)?;
info!("sent CopyOut");

/* Send a tarball of the latest layer on the timeline */
{
let mut writer = CopyDataSink { pgb };
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
span.record("lsn", &basebackup.lsn.to_string().as_str());
basebackup.send_tarball()?;
}
pgb.write_message(&BeMessage::CopyDone)?;
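`CopyDataSink { pgb }` above adapts the COPY subprotocol to `std::io::Write`, so the tarball writer can stream straight into CopyData messages. Its definition is outside this hunk; a plausible sketch, assuming `write_message` returns an `anyhow::Result` as its call sites suggest:

// Sketch only: the real CopyDataSink lives elsewhere in page_service.rs.
struct CopyDataSink<'a> {
    pgb: &'a mut PostgresBackend,
}

impl std::io::Write for CopyDataSink<'_> {
    fn write(&mut self, data: &[u8]) -> std::io::Result<usize> {
        // Wrap every chunk the tar writer produces in a CopyData message.
        self.pgb
            .write_message(&BeMessage::CopyData(data))
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
        Ok(data.len())
    }

    fn flush(&mut self) -> std::io::Result<()> {
        // Messages are flushed by the backend itself; nothing extra to do.
        Ok(())
    }
}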
@@ -569,6 +527,11 @@ impl postgres_backend::Handler for PageServerHandler {
None
};

info!(
"got basebackup command. tenantid=\"{}\" timelineid=\"{}\" lsn=\"{:#?}\"",
tenantid, timelineid, lsn
);

// Check that the timeline exists
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -586,9 +549,6 @@ impl postgres_backend::Handler for PageServerHandler {

self.check_permission(Some(tenantid))?;

let _enter =
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();

// Check that the timeline exists
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;

@@ -611,9 +571,6 @@ impl postgres_backend::Handler for PageServerHandler {

self.check_permission(Some(tenantid))?;

let _enter =
info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();

let branch =
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
let branch = serde_json::to_vec(&branch)?;
@@ -697,14 +654,12 @@ impl postgres_backend::Handler for PageServerHandler {
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
@@ -724,12 +679,6 @@ impl postgres_backend::Handler for PageServerHandler {
.as_bytes(),
),
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
@@ -746,12 +695,6 @@ impl postgres_backend::Handler for PageServerHandler {
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(result.elapsed.as_millis().to_string().as_bytes()),

@@ -8,41 +8,14 @@

mod local_fs;
mod rust_s3;
/// A queue-based storage with the background machinery behind it to synchronize
/// local page server layer files with external storage.
mod synced_storage;
/// A queue and the background machinery behind it to upload
/// local page server layer files to external storage.
pub mod storage_uploader;

use std::{path::Path, thread};
use std::path::Path;

use anyhow::Context;

pub use self::synced_storage::schedule_timeline_upload;
use self::{local_fs::LocalFs, rust_s3::RustS3};
use crate::{PageServerConf, RelishStorageKind};

pub fn run_storage_sync_thread(
config: &'static PageServerConf,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
match &config.relish_storage_config {
Some(relish_storage_config) => {
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
match &relish_storage_config.storage {
RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
config,
LocalFs::new(root.clone())?,
max_concurrent_sync,
),
RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
config,
RustS3::new(s3_config)?,
max_concurrent_sync,
),
}
}
None => Ok(None),
}
}

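The dispatch above implies a config shape along these lines. `RelishStorageKind` is imported above; the wrapping struct's real name is not visible in this hunk, so the one below is hypothetical (and note the storage_uploader file later in this diff matches a flat `RelishStorageConfig` enum instead, so the two sides of the comparison disagree on the exact shape):

use std::path::PathBuf;

// Inferred sketch, not the actual definitions from the crate root.
// S3Config is defined elsewhere in the crate.
pub enum RelishStorageKind {
    // Root of a local-filesystem "remote".
    LocalFs(PathBuf),
    // Connection parameters for an S3 bucket.
    AwsS3(S3Config),
}

pub struct RelishStorageSettings {
    pub max_concurrent_sync: usize,
    pub storage: RelishStorageKind,
}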
/// Storage (potentially remote) API to manage its state.
#[async_trait::async_trait]
pub trait RelishStorage: Send + Sync {
@@ -55,21 +28,15 @@ pub trait RelishStorage: Send + Sync {

async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;

async fn download_relish<W: 'static + std::io::Write + Send>(
async fn download_relish(
&self,
from: &Self::RelishStoragePath,
// rust_s3's `get_object_stream` method requires `std::io::BufWriter` for some reason, not the async counterpart;
// that forces us to consume and return the writer to satisfy the blocking-operation async wrapper requirements
to: std::io::BufWriter<W>,
) -> anyhow::Result<std::io::BufWriter<W>>;
to: &Path,
) -> anyhow::Result<()>;

async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;

async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
&self,
from: &mut tokio::io::BufReader<R>,
to: &Self::RelishStoragePath,
) -> anyhow::Result<()>;
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()>;
}

fn strip_workspace_prefix<'a>(

@@ -9,13 +9,11 @@

use std::{
future::Future,
io::Write,
path::{Path, PathBuf},
pin::Pin,
};

use anyhow::{bail, Context};
use tokio::{fs, io};

use super::{strip_workspace_prefix, RelishStorage};

@@ -66,33 +64,16 @@ impl RelishStorage for LocalFs {
Ok(get_all_files(&self.root).await?.into_iter().collect())
}

async fn download_relish<W: 'static + std::io::Write + Send>(
async fn download_relish(
&self,
from: &Self::RelishStoragePath,
mut to: std::io::BufWriter<W>,
) -> anyhow::Result<std::io::BufWriter<W>> {
to: &Path,
) -> anyhow::Result<()> {
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
let updated_buffer = tokio::task::spawn_blocking(move || {
let mut source = std::io::BufReader::new(
std::fs::OpenOptions::new()
.read(true)
.open(&file_path)
.with_context(|| {
format!(
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})?,
);
std::io::copy(&mut source, &mut to)
.context("Failed to download the relish file")?;
to.flush().context("Failed to flush the download buffer")?;
Ok::<_, anyhow::Error>(to)
})
.await
.context("Failed to spawn a blocking task")??;
Ok(updated_buffer)
create_target_directory(to).await?;
tokio::fs::copy(file_path, to).await?;
Ok(())
} else {
bail!(
"File '{}' either does not exist or is not a file",
@@ -113,30 +94,18 @@ impl RelishStorage for LocalFs {
}
}

async fn upload_relish<R: io::AsyncRead + std::marker::Unpin + Send>(
&self,
from: &mut io::BufReader<R>,
to: &Self::RelishStoragePath,
) -> anyhow::Result<()> {
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> {
let target_file_path = self.resolve_in_storage(to)?;
create_target_directory(&target_file_path).await?;
let mut destination = io::BufWriter::new(
fs::OpenOptions::new()
.write(true)
.create(true)
.open(&target_file_path)
.await
.with_context(|| {
format!(
"Failed to open target fs destination at '{}'",
target_file_path.display()
)
})?,
);

io::copy_buf(from, &mut destination)
tokio::fs::copy(&from, &target_file_path)
.await
.context("Failed to upload relish to local storage")?;
.with_context(|| {
format!(
"Failed to upload relish '{}' to local storage",
from.display(),
)
})?;
Ok(())
}
}

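Both new code paths above call `create_target_directory`, which is not part of this hunk. A minimal sketch of what it plausibly does — make sure the parent directory of the target file exists before `tokio::fs::copy` runs (the real helper lives elsewhere in local_fs.rs and may differ):

use anyhow::Context;
use std::path::Path;

// Sketch; assumed shape of the helper referenced above.
async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
    if let Some(target_dir) = target_file_path.parent() {
        tokio::fs::create_dir_all(target_dir)
            .await
            .with_context(|| format!("Failed to create directory '{}'", target_dir.display()))?;
    }
    Ok(())
}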
@@ -1,15 +1,13 @@
//! A wrapper around AWS S3 client library `rust_s3` to be used as relish storage.

use std::io::Write;
use std::path::Path;

use anyhow::Context;
use s3::{bucket::Bucket, creds::Credentials, region::Region};

use crate::{
relish_storage::{strip_workspace_prefix, RelishStorage},
S3Config,
};
use crate::{relish_storage::strip_workspace_prefix, S3Config};

use super::RelishStorage;

const S3_FILE_SEPARATOR: char = '/';

@@ -84,14 +82,18 @@ impl RelishStorage for RustS3 {
.collect())
}

async fn download_relish<W: 'static + std::io::Write + Send>(
async fn download_relish(
&self,
from: &Self::RelishStoragePath,
mut to: std::io::BufWriter<W>,
) -> anyhow::Result<std::io::BufWriter<W>> {
to: &Path,
) -> anyhow::Result<()> {
let mut target_file = std::fs::OpenOptions::new()
.write(true)
.open(to)
.with_context(|| format!("Failed to open target s3 destination at {}", to.display()))?;
let code = self
.bucket
.get_object_stream(from.key(), &mut to)
.get_object_stream(from.key(), &mut target_file)
.await
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
if code != 200 {
@@ -100,12 +102,7 @@ impl RelishStorage for RustS3 {
code
))
} else {
tokio::task::spawn_blocking(move || {
to.flush().context("Failed to flush the download buffer")?;
Ok::<_, anyhow::Error>(to)
})
.await
.context("Failed to join the download buffer flush task")?
Ok(())
}
}

@@ -126,14 +123,12 @@ impl RelishStorage for RustS3 {
}
}

async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
&self,
from: &mut tokio::io::BufReader<R>,
to: &Self::RelishStoragePath,
) -> anyhow::Result<()> {
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> {
let mut local_file = tokio::fs::OpenOptions::new().read(true).open(from).await?;

let code = self
.bucket
.put_object_stream(from, to.key())
.put_object_stream(&mut local_file, to.key())
.await
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
if code != 200 {

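`from.key()` above suggests the S3 backend's path type is an object key built by joining path components with `S3_FILE_SEPARATOR`. A hypothetical sketch of such a type, for illustration only; the real definition is outside this hunk:

use std::path::Path;

// Hypothetical reconstruction of the key-based path type.
struct S3ObjectKey(String);

impl S3ObjectKey {
    fn key(&self) -> &str {
        &self.0
    }

    // Join the components of a workspace-relative path into a '/'-separated
    // object key, which is what strip_workspace_prefix and S3_FILE_SEPARATOR
    // appear to exist for.
    fn from_relative_path(relative: &Path) -> Self {
        let key = relative
            .components()
            .map(|c| c.as_os_str().to_string_lossy().into_owned())
            .collect::<Vec<_>>()
            .join("/");
        S3ObjectKey(key)
    }
}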
116 pageserver/src/relish_storage/storage_uploader.rs Normal file
@@ -0,0 +1,116 @@
use std::{
collections::VecDeque,
path::{Path, PathBuf},
sync::{Arc, Mutex},
thread,
};

use zenith_utils::zid::ZTimelineId;

use crate::{relish_storage::RelishStorage, RelishStorageConfig};

use super::{local_fs::LocalFs, rust_s3::RustS3};

pub struct QueueBasedRelishUploader {
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
}

impl QueueBasedRelishUploader {
pub fn new(
config: &RelishStorageConfig,
page_server_workdir: &'static Path,
) -> anyhow::Result<Self> {
let upload_queue = Arc::new(Mutex::new(VecDeque::new()));
let _handle = match config {
RelishStorageConfig::LocalFs(root) => {
let relish_storage = LocalFs::new(root.clone())?;
create_upload_thread(
Arc::clone(&upload_queue),
relish_storage,
page_server_workdir,
)?
}
RelishStorageConfig::AwsS3(s3_config) => {
let relish_storage = RustS3::new(s3_config)?;
create_upload_thread(
Arc::clone(&upload_queue),
relish_storage,
page_server_workdir,
)?
}
};

Ok(Self { upload_queue })
}

pub fn schedule_upload(&self, timeline_id: ZTimelineId, relish_path: PathBuf) {
self.upload_queue
.lock()
.unwrap()
.push_back((timeline_id, relish_path))
}
}

fn create_upload_thread<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
relish_storage: S,
page_server_workdir: &'static Path,
) -> std::io::Result<thread::JoinHandle<()>> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
thread::Builder::new()
.name("Queue based relish uploader".to_string())
.spawn(move || loop {
runtime.block_on(async {
upload_loop_step(&upload_queue, &relish_storage, page_server_workdir).await;
})
})
}

async fn upload_loop_step<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
upload_queue: &Mutex<VecDeque<(ZTimelineId, PathBuf)>>,
relish_storage: &S,
page_server_workdir: &Path,
) {
let mut queue_accessor = upload_queue.lock().unwrap();
log::debug!("current upload queue length: {}", queue_accessor.len());
let next_upload = queue_accessor.pop_front();
drop(queue_accessor);

let (relish_timeline_id, relish_local_path) = match next_upload {
Some(data) => data,
None => {
// Don't spin, and allow others to use the queue.
// In the future, this could be made smarter about delays, depending on relish upload stats
thread::sleep(std::time::Duration::from_secs(1));
return;
}
};

if let Err(e) = upload_relish(relish_storage, page_server_workdir, &relish_local_path).await {
log::error!(
"Failed to upload relish '{}' for timeline {}, reason: {}",
relish_local_path.display(),
relish_timeline_id,
e
);
upload_queue
.lock()
.unwrap()
.push_back((relish_timeline_id, relish_local_path))
} else {
log::debug!("Relish successfully uploaded");
}
}

async fn upload_relish<P, S: RelishStorage<RelishStoragePath = P>>(
relish_storage: &S,
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<()> {
let destination = S::derive_destination(page_server_workdir, relish_local_path)?;
relish_storage
.upload_relish(relish_local_path, &destination)
.await
}

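A hedged usage sketch of the new file's API: build the uploader once at startup and hand it layer files as they are written. `config`, the workdir, and the layer file path are placeholders for values the page server already owns; `Box::leak` satisfies the `&'static Path` the constructor asks for:

use std::path::PathBuf;
use zenith_utils::zid::ZTimelineId;

fn wire_up_uploader(
    config: &RelishStorageConfig,
    workdir: PathBuf,
    timeline_id: ZTimelineId,
) -> anyhow::Result<()> {
    // The constructor wants a 'static workdir; leaking once at startup is fine.
    let workdir: &'static std::path::Path = Box::leak(workdir.into_boxed_path());
    let uploader = QueueBasedRelishUploader::new(config, workdir)?;
    // Hypothetical layer file; in the page server this would be a path
    // produced by the layered repository when a layer is frozen to disk.
    uploader.schedule_upload(timeline_id, workdir.join("example_layer_file"));
    Ok(())
}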
@@ -1,57 +0,0 @@
use std::time::Duration;
use std::{collections::BinaryHeap, sync::Mutex, thread};

use crate::tenant_mgr;
use crate::{relish_storage::RelishStorage, PageServerConf};

lazy_static::lazy_static! {
static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
}

pub fn schedule_timeline_upload(_local_timeline: ()) {
// UPLOAD_QUEUE
// .lock()
// .unwrap()
// .push(SyncTask::Upload(local_timeline))
}

#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
enum SyncTask {}

pub fn run_storage_sync_thread<
P: std::fmt::Debug,
S: 'static + RelishStorage<RelishStoragePath = P>,
>(
config: &'static PageServerConf,
relish_storage: S,
max_concurrent_sync: usize,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;

let handle = thread::Builder::new()
.name("Queue based relish storage sync".to_string())
.spawn(move || {
while !tenant_mgr::shutdown_requested() {
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
log::debug!("Upload queue length: {}", queue_accessor.len());
let next_task = queue_accessor.pop();
drop(queue_accessor);
match next_task {
Some(task) => runtime.block_on(async {
// suppress warnings
let _ = (config, task, &relish_storage, max_concurrent_sync);
todo!("omitted for brevity")
}),
None => {
thread::sleep(Duration::from_secs(1));
continue;
}
}
}
log::debug!("Queue based relish storage sync thread shut down");
Ok(())
})?;
Ok(Some(handle))
}
@@ -3,7 +3,7 @@ use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::ops::{AddAssign, Deref};
use std::ops::AddAssign;
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::{Lsn, RecordLsn};
@@ -13,8 +13,6 @@ use zenith_utils::zid::ZTimelineId;
/// A repository corresponds to one .zenith directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
pub trait Repository: Send + Sync {
fn shutdown(&self) -> Result<()>;

/// Get Timeline handle for given zenith timeline ID.
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;

@@ -30,14 +28,18 @@ pub trait Repository: Send + Sync {
///
/// 'timelineid' specifies the timeline to GC, or None for all.
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
/// to make tests more deterministic.
/// TODO Do we still need it, or can we call checkpoint explicitly in tests where needed?
/// `compact` parameter is used to force compaction of storage.
/// Some storage implementations are based on LSM trees and require periodic merging (compaction).
/// Usually the storage implementation itself determines when compaction should be performed,
/// but for GC tests it may be useful to force compaction just after a GC iteration completes,
/// to make sure that all detected garbage is removed.
/// So right now `compact` is set to true when GC is explicitly requested through the page server API,
/// and is set to false in the GC threads, which repeat GC iterations in an infinite loop.
fn gc_iteration(
&self,
timelineid: Option<ZTimelineId>,
horizon: u64,
checkpoint_before_gc: bool,
compact: bool,
) -> Result<GcResult>;
}

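Illustrative call sites for the two `compact` policies that doc comment describes, using the new `gc_iteration(timelineid, horizon, compact)` shape from this hunk; the surrounding function is hypothetical:

fn gc_call_sites(repo: &dyn Repository, timeline: ZTimelineId) -> anyhow::Result<()> {
    // GC explicitly requested through the page server API (or a test):
    // force compaction so all detected garbage is removed right away.
    let result = repo.gc_iteration(Some(timeline), 0, true)?;
    println!("removed {} relation layer files", result.ondisk_relfiles_removed);

    // Background GC loop over all timelines: keep a PITR horizon and let
    // the storage implementation decide when to compact.
    let _ = repo.gc_iteration(None, 64 * 1024 * 1024, false)?;
    Ok(())
}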
@@ -50,7 +52,6 @@ pub struct GcResult {
pub ondisk_relfiles_needed_by_cutoff: u64,
pub ondisk_relfiles_needed_by_branches: u64,
pub ondisk_relfiles_not_updated: u64,
pub ondisk_relfiles_needed_as_tombstone: u64,
pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped

@@ -58,7 +59,6 @@ pub struct GcResult {
pub ondisk_nonrelfiles_needed_by_cutoff: u64,
pub ondisk_nonrelfiles_needed_by_branches: u64,
pub ondisk_nonrelfiles_not_updated: u64,
pub ondisk_nonrelfiles_needed_as_tombstone: u64,
pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped

@@ -71,7 +71,6 @@ impl AddAssign for GcResult {
self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;

@@ -79,7 +78,6 @@ impl AddAssign for GcResult {
self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone;
self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;

@@ -125,6 +123,26 @@ pub trait Timeline: Send + Sync {
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------

/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put_wal_record(&self, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;

/// Like put_wal_record, but with a ready-made image of the page.
fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>;

/// Truncate relation
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;

/// This method is used for marking dropped relations, truncated SLRU files and aborted two-phase records
fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;

/// Track end of the latest digested WAL record.
///
/// Advance requires an aligned LSN as an argument and wakes wait_lsn() callers.
/// The previous last record LSN is stored alongside the latest and can be read.
fn advance_last_record_lsn(&self, lsn: Lsn);
/// Atomically get both last and prev.
fn get_last_record_rlsn(&self) -> RecordLsn;
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
@@ -132,9 +150,6 @@ pub trait Timeline: Send + Sync {
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;

/// Mutate the timeline with a [`TimelineWriter`].
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;

///
/// Flush to disk all data that was written with the put_* functions
///
@@ -153,35 +168,9 @@ pub trait Timeline: Send + Sync {
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
}

/// Various functions to mutate the timeline.
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered a bad practice in Rust and should be fixed eventually,
// but will cause large code changes.
pub trait TimelineWriter: Deref<Target = dyn Timeline> {
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put_wal_record(&self, lsn: Lsn, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;

/// Like put_wal_record, but with a ready-made image of the page.
fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>;

/// Truncate relation
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;

/// This method is used for marking dropped relations, truncated SLRU files and aborted two-phase records
fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;

/// Track end of the latest digested WAL record.
///
/// Advance requires an aligned LSN as an argument and wakes wait_lsn() callers.
/// The previous last record LSN is stored alongside the latest and can be read.
fn advance_last_record_lsn(&self, lsn: Lsn);
}

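The Deref arrangement the TODO above describes looks roughly like this in an implementation: a write handle that derefs to the read-only `Timeline`, so call sites can mix reads and writes through one binding. Types here are illustrative, not the layered repository's actual writer:

use std::ops::Deref;

// Illustrative only.
struct ExampleTimelineWriter<'a> {
    timeline: &'a dyn Timeline,
}

impl<'a> Deref for ExampleTimelineWriter<'a> {
    type Target = dyn Timeline + 'a;

    fn deref(&self) -> &Self::Target {
        // Reads fall through to the underlying Timeline.
        self.timeline
    }
}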
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct WALRecord {
pub lsn: Lsn, // LSN at the *end* of the record
pub will_init: bool,
pub rec: Bytes,
// Remember the offset of main_data in rec,
@@ -192,19 +181,22 @@ pub struct WALRecord {

impl WALRecord {
pub fn pack(&self, buf: &mut BytesMut) {
buf.put_u64(self.lsn.0);
buf.put_u8(self.will_init as u8);
buf.put_u32(self.main_data_offset);
buf.put_u32(self.rec.len() as u32);
buf.put_slice(&self.rec[..]);
}
pub fn unpack(buf: &mut Bytes) -> WALRecord {
let lsn = Lsn::from(buf.get_u64());
let will_init = buf.get_u8() != 0;
let main_data_offset = buf.get_u32();
let rec_len = buf.get_u32() as usize;
let rec = buf.split_to(rec_len);
let mut dst = vec![0u8; buf.get_u32() as usize];
buf.copy_to_slice(&mut dst);
WALRecord {
lsn,
will_init,
rec,
rec: Bytes::from(dst),
main_data_offset,
}
}
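The pack/unpack pair above must stay symmetric (field order: lsn, will_init, main_data_offset, record length, record body), regardless of which unpack variant is used. A small round-trip check along these lines would catch a drift; it relies only on the derives and methods shown above:

#[test]
fn walrecord_pack_unpack_roundtrip() {
    let original = WALRecord {
        lsn: Lsn(0x1234),
        will_init: true,
        rec: Bytes::from_static(b"payload"),
        main_data_offset: 2,
    };
    let mut buf = BytesMut::new();
    original.pack(&mut buf);
    // unpack consumes from the front of the buffer and must reproduce
    // exactly what pack wrote.
    assert_eq!(WALRecord::unpack(&mut buf.freeze()), original);
}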
@@ -217,21 +209,15 @@ impl WALRecord {
#[cfg(test)]
mod tests {
use super::*;
use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME};
use crate::layered_repository::LayeredRepository;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use hex_literal::hex;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
use std::fs;
use std::path::PathBuf;
use std::str::FromStr;
use zenith_utils::zid::ZTenantId;

const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));

/// Arbitrary relation tag, for testing.
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
spcnode: 0,
@@ -267,62 +253,47 @@ mod tests {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);

struct RepoHarness {
conf: &'static PageServerConf,
tenant_id: ZTenantId,
}
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;

impl RepoHarness {
fn create(test_name: &'static str) -> Result<Self> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenantid = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();

let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let walredo_mgr = TestRedoManager {};

let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
let repo = Box::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
));

Ok(Self { conf, tenant_id })
}

fn load(&self) -> Box<dyn Repository> {
let walredo_mgr = Arc::new(TestRedoManager);

Box::new(LayeredRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,
false,
))
}

fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
Ok(repo)
}

#[test]
fn test_relsize() -> Result<()> {
let repo = RepoHarness::create("test_relsize")?.load();
let repo = get_test_repo("test_relsize")?;
// get_timeline() with non-existent timeline id should fail
//repo.get_timeline("11223344556677881122334455667788");

// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;

writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
tline.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;

writer.advance_last_record_lsn(Lsn(0x50));
tline.advance_last_record_lsn(Lsn(0x50));

assert_current_logical_size(&tline, Lsn(0x50));

@@ -368,8 +339,8 @@ mod tests {
);

// Truncate last block
writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
writer.advance_last_record_lsn(Lsn(0x60));
tline.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
tline.advance_last_record_lsn(Lsn(0x60));
assert_current_logical_size(&tline, Lsn(0x60));

// Check reported size and contents after truncation
@@ -391,13 +362,13 @@ mod tests {
);

// Truncate to zero length
writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
writer.advance_last_record_lsn(Lsn(0x68));
tline.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
tline.advance_last_record_lsn(Lsn(0x68));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0);

// Extend from 0 to 2 blocks, leaving a gap
writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
writer.advance_last_record_lsn(Lsn(0x70));
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
tline.advance_last_record_lsn(Lsn(0x70));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
assert_eq!(
@@ -424,159 +395,21 @@ mod tests {
Ok(())
}

// Test what happens if we dropped a relation
// and then created it again within the same layer.
#[test]
fn test_drop_extend() -> Result<()> {
let repo = RepoHarness::create("test_drop_extend")?.load();

// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();

writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.advance_last_record_lsn(Lsn(0x20));

// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1);

// Drop relish
writer.drop_relish(TESTREL_A, Lsn(0x30))?;
writer.advance_last_record_lsn(Lsn(0x30));

// Check that rel is not visible anymore
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none());

// Extend it again
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
writer.advance_last_record_lsn(Lsn(0x40));

// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x40))?.unwrap(), 1);

Ok(())
}

// Test what happens if we truncated a relation
// so that one of its segments was dropped
// and then extended it again within the same layer.
#[test]
fn test_truncate_extend() -> Result<()> {
let repo = RepoHarness::create("test_truncate_extend")?.load();

// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();

// from storage_layer.rs
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
let relsize = RELISH_SEG_SIZE * 2;

// Create relation with relsize blocks
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
}

writer.advance_last_record_lsn(Lsn(0x20));

// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none());

assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(),
relsize
);

// Check relation content
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blkno, lsn)?,
TEST_IMG(&data)
);
}

// Truncate relation so that second segment was dropped
// - only leave one page
writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
writer.advance_last_record_lsn(Lsn(0x60));

// Check reported size and contents after truncation
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1);

for blkno in 0..1 {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?,
TEST_IMG(&data)
);
}

// should still see all blocks with older LSN
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(),
relsize
);
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?,
TEST_IMG(&data)
);
}

// Extend relation again.
// Add enough blocks to create second segment
for blkno in 0..relsize {
let lsn = Lsn(0x80);
let data = format!("foo blk {} at {}", blkno, lsn);
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
}
writer.advance_last_record_lsn(Lsn(0x80));

assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(),
relsize
);
// Check relation content
for blkno in 0..relsize {
let lsn = Lsn(0x80);
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?,
TEST_IMG(&data)
);
}

Ok(())
}

/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
/// split into multiple 1 GB segments in Postgres.
#[test]
fn test_large_rel() -> Result<()> {
let repo = RepoHarness::create("test_large_rel")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
let repo = get_test_repo("test_large_rel")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;

let mut lsn = 0x10;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
lsn += 0x10;
writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
}
writer.advance_last_record_lsn(Lsn(lsn));
tline.advance_last_record_lsn(Lsn(lsn));

assert_current_logical_size(&tline, Lsn(lsn));

@@ -587,8 +420,8 @@ mod tests {

// Truncate one block
lsn += 0x10;
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
writer.advance_last_record_lsn(Lsn(lsn));
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
tline.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
pg_constants::RELSEG_SIZE
@@ -597,8 +430,8 @@ mod tests {

// Truncate another block
lsn += 0x10;
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
writer.advance_last_record_lsn(Lsn(lsn));
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
tline.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
pg_constants::RELSEG_SIZE - 1
@@ -610,8 +443,8 @@ mod tests {
let mut size: i32 = 3000;
while size >= 0 {
lsn += 0x10;
writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
writer.advance_last_record_lsn(Lsn(lsn));
tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
tline.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
size as u32
@@ -628,20 +461,23 @@ mod tests {
/// Test list_rels() function, with branches and dropped relations
///
#[test]
// FIXME: The last assertion in this test is currently failing, see
// https://github.com/zenithdb/zenith/issues/502. Ignore the failure until that's fixed.
#[ignore]
fn test_list_rels_drop() -> Result<()> {
let repo = RepoHarness::create("test_list_rels_drop")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
let repo = get_test_repo("test_list_rels_drop")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;
const TESTDB: u32 = 111;

// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;

// Create a relation on the timeline
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;

writer.advance_last_record_lsn(Lsn(0x30));
tline.advance_last_record_lsn(Lsn(0x30));

// Check that list_rels() lists it after LSN 2, but not before it
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A));
@@ -649,19 +485,17 @@ mod tests {
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));

// Create a branch, check that the relation is visible there
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
let new_writer = newtline.writer();
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
let newtline = repo.get_timeline(newtimelineid)?;

assert!(newtline
.list_rels(0, TESTDB, Lsn(0x30))?
.contains(&TESTREL_A));

// Drop it on the branch
new_writer.drop_relish(TESTREL_A, Lsn(0x40))?;
new_writer.advance_last_record_lsn(Lsn(0x40));

drop(new_writer);
newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
newtline.advance_last_record_lsn(Lsn(0x40));

// Check that it's no longer listed on the branch after the point where it was dropped
assert!(newtline
@@ -673,8 +507,9 @@ mod tests {

// Run checkpoint and garbage collection and check that it's still not visible
newtline.checkpoint()?;
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
repo.gc_iteration(Some(newtimelineid), 0, true)?;

// FIXME: this is currently failing
assert!(!newtline
.list_rels(0, TESTDB, Lsn(0x40))?
.contains(&TESTREL_A));
@@ -687,32 +522,32 @@ mod tests {
///
#[test]
fn test_branch() -> Result<()> {
let repo = RepoHarness::create("test_branch")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
let repo = get_test_repo("test_branch")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;

// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;

// Create a relation on the timeline
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;

// Create another relation
writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
tline.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;

writer.advance_last_record_lsn(Lsn(0x40));
tline.advance_last_record_lsn(Lsn(0x40));
assert_current_logical_size(&tline, Lsn(0x40));

// Branch the history, modify relation differently on the new timeline
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
let new_writer = newtline.writer();
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
let newtline = repo.get_timeline(newtimelineid)?;

new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
new_writer.advance_last_record_lsn(Lsn(0x40));
newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
newtline.advance_last_record_lsn(Lsn(0x40));

// Check page contents on both branches
assert_eq!(
@@ -737,89 +572,8 @@ mod tests {
Ok(())
}

#[test]
fn corrupt_metadata() -> Result<()> {
const TEST_NAME: &str = "corrupt_metadata";
let harness = RepoHarness::create(TEST_NAME)?;
let repo = harness.load();

repo.create_empty_timeline(TIMELINE_ID)?;
drop(repo);

let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);

assert!(metadata_path.is_file());

let mut metadata_bytes = std::fs::read(&metadata_path)?;
assert_eq!(metadata_bytes.len(), 512);
metadata_bytes[512 - 4 - 2] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;

let new_repo = harness.load();
let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
assert!(err.to_string().contains("checksum"));

Ok(())
}

#[test]
fn future_layerfiles() -> Result<()> {
const TEST_NAME: &str = "future_layerfiles";
let harness = RepoHarness::create(TEST_NAME)?;
let repo = harness.load();

repo.create_empty_timeline(TIMELINE_ID)?;
drop(repo);

let timeline_path = harness.timeline_path(&TIMELINE_ID);

let make_empty_file = |filename: &str| -> std::io::Result<()> {
let path = timeline_path.join(filename);

assert!(!path.exists());
std::fs::write(&path, &[])?;

Ok(())
};

let image_filename = format!("pg_control_0_{:016X}", 8000);
let delta_filename = format!("pg_control_0_{:016X}_{:016X}", 8000, 8008);

make_empty_file(&image_filename)?;
make_empty_file(&delta_filename)?;

let new_repo = harness.load();
new_repo.get_timeline(TIMELINE_ID).unwrap();
drop(new_repo);

let check_old = |filename: &str, num: u32| {
let path = timeline_path.join(filename);
assert!(!path.exists());

let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
assert!(backup_path.exists());
};

check_old(&image_filename, 0);
check_old(&delta_filename, 0);

make_empty_file(&image_filename)?;
make_empty_file(&delta_filename)?;

let new_repo = harness.load();
new_repo.get_timeline(TIMELINE_ID).unwrap();
drop(new_repo);

check_old(&image_filename, 0);
check_old(&delta_filename, 0);
check_old(&image_filename, 1);
check_old(&delta_filename, 1);

Ok(())
}

// Mock WAL redo manager that doesn't do much
struct TestRedoManager;
struct TestRedoManager {}

impl WalRedoManager for TestRedoManager {
fn request_redo(
@@ -828,7 +582,7 @@ mod tests {
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} blk {} to get to {}, with {} and {} records",

@@ -2,6 +2,7 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! zenith Timeline.
//!
use log::*;
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
use std::cmp::min;
@@ -10,9 +11,8 @@ use std::fs::File;
use std::io::Read;
use std::path::Path;

use anyhow::{bail, Result};
use anyhow::Result;
use bytes::{Buf, Bytes};
use tracing::*;

use crate::relish::*;
use crate::repository::*;
@@ -34,7 +34,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
///
pub fn import_timeline_from_postgres_datadir(
path: &Path,
writer: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
) -> Result<()> {
// Scan 'global'
@@ -44,10 +44,10 @@ pub fn import_timeline_from_postgres_datadir(
None => continue,

Some("pg_control") => {
import_control_file(writer, lsn, &direntry.path())?;
import_control_file(timeline, lsn, &direntry.path())?;
}
Some("pg_filenode.map") => import_nonrel_file(
writer,
timeline,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::GLOBALTABLESPACE_OID,
@@ -59,7 +59,7 @@ pub fn import_timeline_from_postgres_datadir(
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
writer,
timeline,
lsn,
pg_constants::GLOBALTABLESPACE_OID,
0,
@@ -86,7 +86,7 @@ pub fn import_timeline_from_postgres_datadir(

Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
writer,
timeline,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
@@ -98,7 +98,7 @@ pub fn import_timeline_from_postgres_datadir(
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
writer,
timeline,
lsn,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
@@ -108,24 +108,24 @@ pub fn import_timeline_from_postgres_datadir(
}
for entry in fs::read_dir(path.join("pg_xact"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
}
// TODO: Scan pg_tblspc

writer.advance_last_record_lsn(lsn);
timeline.advance_last_record_lsn(lsn);

Ok(())
}
@@ -133,13 +133,12 @@ pub fn import_timeline_from_postgres_datadir(
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_relfile(
path: &Path,
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
spcoid: Oid,
dboid: Oid,
) -> Result<()> {
// Does it look like a relation file?
trace!("importing rel file {}", path.display());

let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
@@ -167,14 +166,15 @@ fn import_relfile(
}

// TODO: UnexpectedEof is expected
Err(err) => match err.kind() {
Err(e) => match e.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
break;
}
_ => {
bail!("error reading file {}: {:#}", path.display(), err);
error!("error reading file: {:?} ({})", path, e);
break;
}
},
};
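For context, the loop those match arms live in reads the relation file one 8 KiB page at a time until EOF. A sketch of the enclosing body under the bail!-style error handling; `file`, `timeline`, `tag`, `lsn`, and `path` stand in for the locals of import_relfile, which are not all visible in this hunk:

// Sketch of the enclosing read loop, not a verbatim excerpt.
let mut blknum: u32 = 0;
let mut buf: [u8; 8192] = [0u8; 8192];
loop {
    match file.read_exact(&mut buf) {
        Ok(()) => {
            timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf))?;
            blknum += 1;
        }
        // Reaching EOF is the normal way out of the loop.
        Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => break,
        Err(err) => bail!("error reading file {}: {:#}", path.display(), err),
    }
}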
@@ -191,7 +191,7 @@ fn import_relfile(
/// are just slurped into the repository as one blob.
///
fn import_nonrel_file(
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
tag: RelishTag,
path: &Path,
@@ -201,7 +201,7 @@ fn import_nonrel_file(
// read the whole file
file.read_to_end(&mut buffer)?;

trace!("importing non-rel file {}", path.display());
info!("importing non-rel file {}", path.display());

timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
@@ -212,13 +212,13 @@ fn import_nonrel_file(
///
/// The control file is imported as is, but we also extract the checkpoint record
/// from it and store it separately.
fn import_control_file(timeline: &dyn TimelineWriter, lsn: Lsn, path: &Path) -> Result<()> {
fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result<()> {
let mut file = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
file.read_to_end(&mut buffer)?;

trace!("importing control file {}", path.display());
info!("importing control file {}", path.display());

// Import it as ControlFile
timeline.put_page_image(
@@ -239,18 +239,13 @@ fn import_control_file(timeline: &dyn TimelineWriter, lsn: Lsn, path: &Path) ->
///
/// Import an SLRU segment file
///
fn import_slru_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
slru: SlruKind,
path: &Path,
) -> Result<()> {
fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> {
// Does it look like an SLRU file?
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;

trace!("importing slru file {}", path.display());
info!("importing slru file {}", path.display());

let mut rpageno = 0;
loop {
@@ -266,14 +261,15 @@ fn import_slru_file(
}

// TODO: UnexpectedEof is expected
Err(err) => match err.kind() {
Err(e) => match e.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
break;
}
_ => {
bail!("error reading file {}: {:#}", path.display(), err);
error!("error reading file: {:?} ({})", path, e);
break;
}
},
};
@@ -291,15 +287,12 @@ fn import_slru_file(
///
pub fn save_decoded_record(
checkpoint: &mut CheckPoint,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
decoded: &DecodedWALRecord,
recdata: Bytes,
lsn: Lsn,
) -> Result<()> {
if checkpoint.update_next_xid(decoded.xl_xid) {
*checkpoint_modified = true;
}
checkpoint.update_next_xid(decoded.xl_xid);

// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
@@ -312,12 +305,13 @@ pub fn save_decoded_record(
});

let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};

timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
timeline.put_wal_record(tag, blk.blkno, rec)?;
}

let mut buf = decoded.record.clone();
@@ -382,7 +376,7 @@ pub fn save_decoded_record(
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
save_clog_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
@@ -451,17 +445,10 @@ pub fn save_decoded_record(
)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
save_multixact_create_record(
checkpoint,
checkpoint_modified,
timeline,
lsn,
&xlrec,
decoded,
)?;
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
save_multixact_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
@@ -470,10 +457,7 @@ pub fn save_decoded_record(
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
if checkpoint.nextOid != next_oid {
|
||||
checkpoint.nextOid = next_oid;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
checkpoint.nextOid = next_oid;
|
||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
@@ -489,7 +473,6 @@ pub fn save_decoded_record(
|
||||
);
|
||||
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
|
||||
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -497,11 +480,7 @@ pub fn save_decoded_record(
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
|
||||
fn save_xlog_dbase_create(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
rec: &XlCreateDatabase,
|
||||
) -> Result<()> {
|
||||
fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
|
||||
let db_id = rec.db_id;
|
||||
let tablespace_id = rec.tablespace_id;
|
||||
let src_db_id = rec.src_db_id;
|
||||
@@ -578,11 +557,7 @@ fn save_xlog_dbase_create(
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
|
||||
///
|
||||
/// This is the same logic as in PostgreSQL's smgr_redo() function.
|
||||
fn save_xlog_smgr_truncate(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
rec: &XlSmgrTruncate,
|
||||
) -> Result<()> {
|
||||
fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
|
||||
let spcnode = rec.rnode.spcnode;
|
||||
let dbnode = rec.rnode.dbnode;
|
||||
let relnode = rec.rnode.relnode;
|
||||
@@ -644,7 +619,7 @@ fn save_xlog_smgr_truncate(
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records.
|
||||
///
|
||||
fn save_xact_record(
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
parsed: &XlXactParsedRecord,
|
||||
decoded: &DecodedWALRecord,
|
||||
@@ -655,12 +630,12 @@ fn save_xact_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
@@ -676,7 +651,6 @@ fn save_xact_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
@@ -702,8 +676,7 @@ fn save_xact_record(
|
||||
|
||||
fn save_clog_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlClogTruncate,
|
||||
) -> Result<()> {
|
||||
@@ -721,7 +694,6 @@ fn save_clog_truncate_record(
|
||||
// TODO Figure out if there will be any issues with replica.
|
||||
checkpoint.oldestXid = xlrec.oldest_xid;
|
||||
checkpoint.oldestXidDB = xlrec.oldest_xid_db;
|
||||
*checkpoint_modified = true;
|
||||
|
||||
// TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it
|
||||
|
||||
@@ -764,13 +736,13 @@ fn save_clog_truncate_record(
|
||||
|
||||
fn save_multixact_create_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
@@ -779,7 +751,6 @@ fn save_multixact_create_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
@@ -799,7 +770,6 @@ fn save_multixact_create_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
@@ -822,11 +792,9 @@ fn save_multixact_create_record(
|
||||
}
|
||||
if xlrec.mid >= checkpoint.nextMulti {
|
||||
checkpoint.nextMulti = xlrec.mid + 1;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
|
||||
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
|
||||
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
|
||||
@@ -836,22 +804,18 @@ fn save_multixact_create_record(
|
||||
}
|
||||
});
|
||||
|
||||
if checkpoint.update_next_xid(max_mbr_xid) {
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
checkpoint.update_next_xid(max_mbr_xid);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
) -> Result<()> {
|
||||
checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
*checkpoint_modified = true;
|
||||
|
||||
// PerformMembersTruncation
|
||||
let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
|
||||
@@ -885,7 +849,7 @@ fn save_multixact_truncate_record(
|
||||
}
|
||||
|
||||
fn save_relmap_page(
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlRelmapUpdate,
|
||||
decoded: &DecodedWALRecord,
|
||||
|
||||
@@ -8,139 +8,47 @@ use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use anyhow::{anyhow, bail, Context, Result};
use lazy_static::lazy_static;
use log::{debug, info};
use std::collections::hash_map::Entry;
use log::info;
use std::collections::HashMap;
use std::fs;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, MutexGuard};
use std::thread::JoinHandle;
use std::sync::{Arc, Mutex};
use zenith_utils::zid::{ZTenantId, ZTimelineId};

lazy_static! {
static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
pub static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
Mutex::new(HashMap::new());
}

fn access_repository() -> MutexGuard<'static, HashMap<ZTenantId, Arc<dyn Repository>>> {
REPOSITORY.lock().unwrap()
}
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
}

// Logically these handles belong to Repository,
// but it's just simpler to store them separately
lazy_static! {
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
Mutex::new(HashMap::new());
}

static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);

pub fn init(conf: &'static PageServerConf) {
let mut m = access_repository();
let mut m = REPOSITORY.lock().unwrap();

for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
let tenantid =
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
let repo = init_repo(conf, tenantid);

// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);

// Set up an object repository, for actual data storage.
let repo = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
));
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
LayeredRepository::launch_gc_thread(conf, repo.clone());

info!("initialized storage for tenant: {}", &tenantid);
m.insert(tenantid, repo);
}
}

fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc<LayeredRepository> {
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);

// Set up an object repository, for actual data storage.
let repo = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenant_id,
true,
));

let checkpointer_handle = LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
let gc_handle = LayeredRepository::launch_gc_thread(conf, repo.clone());

let mut handles = TENANT_HANDLES.lock().unwrap();
let h = TenantHandleEntry {
checkpointer_handle: Some(checkpointer_handle),
gc_handle: Some(gc_handle),
};

handles.insert(tenant_id, h);

repo
}

// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
// Relevant PR: https://github.com/zenithdb/zenith/pull/686
pub fn register_relish_download(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
) {
log::info!(
"Registering new download, tenant id {}, timeline id: {}",
tenant_id,
timeline_id
);
match access_repository().entry(tenant_id) {
Entry::Occupied(o) => init_timeline(o.get().as_ref(), timeline_id),
Entry::Vacant(v) => {
log::info!("New repo initialized");
let new_repo = init_repo(conf, tenant_id);
init_timeline(new_repo.as_ref(), timeline_id);
v.insert(new_repo);
}
}
}

fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
match repo.get_timeline(timeline_id) {
Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
}
}

// Check this flag in the thread loops to know when to exit
pub fn shutdown_requested() -> bool {
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
}

pub fn stop_tenant_threads(tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
debug!("checkpointer for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
debug!("gc for tenant {} has stopped", tenantid);
}
}

pub fn shutdown_all_tenants() -> Result<()> {
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);

let tenants = list_tenants()?;
for tenantid in tenants {
stop_tenant_threads(tenantid);
let repo = get_repository_for_tenant(tenantid)?;
debug!("shutdown tenant {}", tenantid);
repo.shutdown()?;
}

Ok(())
}

pub fn create_repository_for_tenant(
conf: &'static PageServerConf,
tenantid: ZTenantId,
) -> Result<()> {
let mut m = access_repository();
let mut m = REPOSITORY.lock().unwrap();

// First check that the tenant doesn't exist already
if m.get(&tenantid).is_some() {
@@ -154,10 +62,15 @@ pub fn create_repository_for_tenant(
Ok(())
}

pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
let o = &mut REPOSITORY.lock().unwrap();
o.insert(tenantid, repo);
}

pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
access_repository()
.get(&tenantid)
.map(Arc::clone)
let o = &REPOSITORY.lock().unwrap();
o.get(&tenantid)
.map(|repo| Arc::clone(repo))
.ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid))
}

@@ -169,14 +82,3 @@ pub fn get_timeline_for_tenant(
.get_timeline(timelineid)
.with_context(|| format!("cannot fetch timeline {}", timelineid))
}

fn list_tenants() -> Result<Vec<ZTenantId>> {
let o = &mut REPOSITORY.lock().unwrap();

o.iter()
.map(|tenant| {
let (tenantid, _) = tenant;
Ok(*tenantid)
})
.collect()
}

@@ -72,10 +72,6 @@ impl WalStreamDecoder {
/// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid.
///
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
let recordbuf;

// Run state machine that validates page headers, and reassembles records
// that cross page boundaries.
loop {
// parse and verify page boundaries as we go
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
@@ -124,41 +120,29 @@ impl WalStreamDecoder {
self.lsn += self.padlen as u64;
self.padlen = 0;
} else if self.contlen == 0 {
assert!(self.recordbuf.is_empty());

// need to have at least the xl_tot_len field

if self.inputbuf.remaining() < 4 {
return Ok(None);
}

// peek xl_tot_len at the beginning of the record.
// FIXME: assumes little-endian
// read xl_tot_len FIXME: assumes little-endian
self.startlsn = self.lsn;
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
let xl_tot_len = self.inputbuf.get_u32_le();
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
return Err(WalDecodeError {
msg: format!("invalid xl_tot_len {}", xl_tot_len),
lsn: self.lsn,
});
}
self.lsn += 4;

// Fast path for the common case that the whole record fits on the page.
let pageleft = self.lsn.remaining_in_block() as u32;
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
// Take the record from the 'inputbuf', and validate it.
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
self.lsn += xl_tot_len as u64;
break;
} else {
// Need to assemble the record from pieces. Remember the size of the
// record, and loop back. On next iteration, we will reach the 'else'
// branch below, and copy the part of the record that was on this page
// to 'recordbuf'. Subsequent iterations will skip page headers, and
// append the continuations from the next pages to 'recordbuf'.
self.recordbuf.reserve(xl_tot_len as usize);
self.contlen = xl_tot_len;
continue;
}
self.recordbuf.clear();
self.recordbuf.reserve(xl_tot_len as usize);
self.recordbuf.put_u32_le(xl_tot_len);

self.contlen = xl_tot_len - 4;
continue;
} else {
// we're continuing a record, possibly from previous page.
let pageleft = self.lsn.remaining_in_block() as u32;
@@ -175,42 +159,47 @@ impl WalStreamDecoder {
self.contlen -= n as u32;

if self.contlen == 0 {
// The record is now complete.
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
break;
let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new());

let recordbuf = recordbuf.freeze();
let mut buf = recordbuf.clone();

let xlogrec = XLogRecord::from_bytes(&mut buf);

// XLOG_SWITCH records are special. If we see one, we need to skip
// to the next WAL segment.
if xlogrec.is_xlog_switch_record() {
trace!("saw xlog switch record at {}", self.lsn);
self.padlen =
self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
} else {
// Pad to an 8-byte boundary
self.padlen = self.lsn.calc_padding(8u32) as u32;
}

let mut crc = crc32c_append(0, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
if crc != xlogrec.xl_crc {
return Err(WalDecodeError {
msg: "WAL record crc mismatch".into(),
lsn: self.lsn,
});
}

// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
// and WalReceiver integration. Since this code is used both for WalReceiver and
// initial WAL import let's force alignment right here.
let result = (self.lsn.align(), recordbuf);
return Ok(Some(result));
}
continue;
}
}
// check record boundaries

// We now have a record in the 'recordbuf' local variable.
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
// deal with continuation records

let mut crc = 0;
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
if crc != xlogrec.xl_crc {
return Err(WalDecodeError {
msg: "WAL record crc mismatch".into(),
lsn: self.lsn,
});
}

// XLOG_SWITCH records are special. If we see one, we need to skip
// to the next WAL segment.
if xlogrec.is_xlog_switch_record() {
trace!("saw xlog switch record at {}", self.lsn);
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
} else {
// Pad to an 8-byte boundary
self.padlen = self.lsn.calc_padding(8u32) as u32;
}

// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
// and WalReceiver integration. Since this code is used both for WalReceiver and
// initial WAL import let's force alignment right here.
let result = (self.lsn.align(), recordbuf);
Ok(Some(result))
// deal with xlog_switch records
}
}


@@ -10,24 +10,24 @@ use crate::restore_local_repo;
use crate::tenant_mgr;
use crate::waldecoder::*;
use crate::PageServerConf;
use anyhow::{bail, Error, Result};
use anyhow::{Error, Result};
use lazy_static::lazy_static;
use log::*;
use postgres::fallible_iterator::FallibleIterator;
use postgres::replication::ReplicationIter;
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use std::cell::Cell;
use std::cmp::{max, min};
use std::collections::HashMap;
use std::fs;
use std::str::FromStr;
use std::sync::Mutex;
use std::thread;
use std::thread::sleep;
use std::thread::JoinHandle;
use std::thread_local;
use std::time::{Duration, SystemTime};
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId;
@@ -37,7 +37,6 @@ use zenith_utils::zid::ZTimelineId;
//
struct WalReceiverEntry {
wal_producer_connstr: String,
wal_receiver_handle: Option<JoinHandle<()>>,
}

lazy_static! {
@@ -45,26 +44,6 @@ lazy_static! {
Mutex::new(HashMap::new());
}

thread_local! {
// Boolean that is true only for WAL receiver threads
//
// This is used in `wait_lsn` to guard against usage that might lead to a deadlock.
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
}

// Wait for walreceiver to stop
// Now it stops when pageserver shutdown is requested.
// In future we can make this more granular and send shutdown signals
// per tenant/timeline to cancel inactive walreceivers.
// TODO deal with blocking pg connections
pub fn stop_wal_receiver(timelineid: ZTimelineId) {
let mut receivers = WAL_RECEIVERS.lock().unwrap();
if let Some(r) = receivers.get_mut(&timelineid) {
r.wal_receiver_handle.take();
// r.wal_receiver_handle.take().map(JoinHandle::join);
}
}

// Launch a new WAL receiver, or tell one that's running about change in connection string
pub fn launch_wal_receiver(
conf: &'static PageServerConf,
@@ -79,19 +58,21 @@ pub fn launch_wal_receiver(
receiver.wal_producer_connstr = wal_producer_connstr.into();
}
None => {
let wal_receiver_handle = thread::Builder::new()
let receiver = WalReceiverEntry {
wal_producer_connstr: wal_producer_connstr.into(),
};
receivers.insert(timelineid, receiver);

// Also launch a new thread to handle this connection
//
// NOTE: This thread name is checked in the assertion in wait_lsn. If you change
// this, make sure you update the assertion too.
let _walreceiver_thread = thread::Builder::new()
.name("WAL receiver thread".into())
.spawn(move || {
IS_WAL_RECEIVER.with(|c| c.set(true));
thread_main(conf, timelineid, tenantid);
})
.unwrap();

let receiver = WalReceiverEntry {
wal_producer_connstr: wal_producer_connstr.into(),
wal_receiver_handle: Some(wal_receiver_handle),
};
receivers.insert(timelineid, receiver);
}
};
}
@@ -111,14 +92,16 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
// This is the entry point for the WAL receiver thread.
//
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
info!("WAL receiver thread started");
info!(
"WAL receiver thread started for timeline : '{}'",
timelineid
);

//
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
// and start streaming WAL from it. If the connection is lost, keep retrying.
//
while !tenant_mgr::shutdown_requested() {
loop {
// Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(timelineid);

@@ -132,11 +115,10 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
sleep(Duration::from_secs(1));
}
}
debug!("WAL streaming shut down");
}

fn walreceiver_main(
_conf: &PageServerConf,
conf: &PageServerConf,
timelineid: ZTimelineId,
wal_producer_connstr: &str,
tenantid: ZTenantId,
@@ -176,15 +158,15 @@ fn walreceiver_main(
let mut startpoint = last_rec_lsn;

if startpoint == Lsn(0) {
bail!("No previous WAL position");
error!("No previous WAL position");
}

// There might be some padding after the last full record, skip it.
startpoint += startpoint.calc_padding(8u32);

info!(
"last_record_lsn {} starting replication from {}, server is at {}...",
last_rec_lsn, startpoint, end_of_wal
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
last_rec_lsn, startpoint, timelineid, end_of_wal
);

let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
@@ -206,38 +188,34 @@ fn walreceiver_main(
let data = xlog_data.data();
let startlsn = Lsn::from(xlog_data.wal_start());
let endlsn = startlsn + data.len() as u64;
let prev_last_rec_lsn = last_rec_lsn;

trace!("received XLogData between {} and {}", startlsn, endlsn);

waldecoder.feed_bytes(data);

while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let _enter = info_span!("processing record", lsn = %lsn).entered();
// Save old checkpoint value to compare with it after decoding WAL record
let old_checkpoint_bytes = checkpoint.encode();
let decoded = decode_wal_record(recdata.clone());

// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hittind a deadlock.
assert!(lsn.is_aligned());

let writer = timeline.writer();

let mut checkpoint_modified = false;

let decoded = decode_wal_record(recdata.clone());
restore_local_repo::save_decoded_record(
&mut checkpoint,
&mut checkpoint_modified,
writer.as_ref(),
&*timeline,
&decoded,
recdata,
lsn,
)?;

let new_checkpoint_bytes = checkpoint.encode();
// Check if checkpoint data was updated by save_decoded_record
if checkpoint_modified {
let new_checkpoint_bytes = checkpoint.encode();

writer.put_page_image(
if new_checkpoint_bytes != old_checkpoint_bytes {
timeline.put_page_image(
RelishTag::Checkpoint,
0,
lsn,
@@ -247,10 +225,38 @@ fn walreceiver_main(

// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
writer.advance_last_record_lsn(lsn);
timeline.advance_last_record_lsn(lsn);
last_rec_lsn = lsn;
}

// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
// "checkpoint" the repository to flush all the changes from WAL we've processed
// so far to disk. After this, we don't need the original WAL anymore, and it
// can be removed. This is probably too aggressive for production, but it's useful
// to expose bugs now.
//
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
// and we might it for logical decoding other things in the future. Although
// we should also be able to fetch it back from the WAL safekeepers or S3 if
// needed.
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
{
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
let (oldest_segno, newest_segno) = find_wal_file_range(
conf,
&timelineid,
pg_constants::WAL_SEGMENT_SIZE,
last_rec_lsn,
&tenantid,
)?;

if newest_segno - oldest_segno >= 10 {
// TODO: This is where we could remove WAL older than last_rec_lsn.
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
}
}

if !caught_up && endlsn >= end_of_wal {
info!("caught up at LSN {}", endlsn);
caught_up = true;
@@ -272,7 +278,7 @@ fn walreceiver_main(
);

if reply_requested {
Some(last_rec_lsn)
Some(timeline.get_last_record_lsn())
} else {
None
}
@@ -292,15 +298,51 @@ fn walreceiver_main(

physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
}

if tenant_mgr::shutdown_requested() {
debug!("stop walreceiver because pageserver shutdown is requested");
break;
}
}
Ok(())
}

fn find_wal_file_range(
conf: &PageServerConf,
timeline: &ZTimelineId,
wal_seg_size: usize,
written_upto: Lsn,
tenant: &ZTenantId,
) -> Result<(u64, u64)> {
let written_upto_segno = written_upto.segment_number(wal_seg_size);

let mut oldest_segno = written_upto_segno;
let mut newest_segno = written_upto_segno;
// Scan the wal directory, and count how many WAL filed we could remove
let wal_dir = conf.wal_dir_path(timeline, tenant);
for entry in fs::read_dir(wal_dir)? {
let entry = entry?;
let path = entry.path();

if path.is_dir() {
continue;
}

let filename = path.file_name().unwrap().to_str().unwrap();

if IsXLogFileName(filename) {
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);

if segno > written_upto_segno {
// that's strange.
warn!("there is a WAL file from future at {}", path.display());
continue;
}

oldest_segno = min(oldest_segno, segno);
newest_segno = max(newest_segno, segno);
}
}
// FIXME: would be good to assert that there are no gaps in the WAL files

Ok((oldest_segno, newest_segno))
}

/// Data returned from the postgres `IDENTIFY_SYSTEM` command
///
/// See the [postgres docs] for more details.

@@ -22,7 +22,8 @@ use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use serde::Serialize;
use serde::{Deserialize, Serialize};
use std::cell::RefCell;
use std::fs;
use std::fs::OpenOptions;
use std::io::prelude::*;
@@ -59,7 +60,7 @@ use postgres_ffi::XLogRecord;
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct BufferTag {
pub rel: RelTag,
pub blknum: u32,
@@ -82,7 +83,7 @@ pub trait WalRedoManager: Send + Sync {
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError>;
}

@@ -99,7 +100,7 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
_blknum: u32,
_lsn: Lsn,
_base_img: Option<Bytes>,
_records: Vec<(Lsn, WALRecord)>,
_records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
Err(WalRedoError::InvalidState)
}
@@ -150,7 +151,7 @@ struct WalRedoRequest {
lsn: Lsn,

base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
records: Vec<WALRecord>,
}

/// An error happened in WAL redo
@@ -179,7 +180,7 @@ impl WalRedoManager for PostgresRedoManager {
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
let start_time;
let lock_time;
@@ -205,7 +206,7 @@ impl WalRedoManager for PostgresRedoManager {
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = process_guard.as_mut().unwrap();
let process = (*process_guard).as_ref().unwrap();

self.runtime
.block_on(self.handle_apply_request(process, &request))
@@ -246,7 +247,7 @@ impl PostgresRedoManager {
///
async fn handle_apply_request(
&self,
process: &mut PostgresRedoProcess,
process: &PostgresRedoProcess,
request: &WalRedoRequest,
) -> Result<Bytes, WalRedoError> {
let rel = request.rel;
@@ -277,7 +278,7 @@ impl PostgresRedoManager {
page.extend_from_slice(&ZERO_PAGE);
}
// Apply all collected WAL records
for (_lsn, record) in records {
for record in records {
let mut buf = record.rec.clone();

WAL_REDO_RECORD_COUNTER.inc();
@@ -298,11 +299,9 @@ impl PostgresRedoManager {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
assert!(
slru == SlruKind::Clog,
"Not valid XACT relish tag {:?}",
rel
);
if slru != SlruKind::Clog {
panic!("Not valid XACT relish tag {:?}", rel);
}
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
@@ -422,7 +421,7 @@ impl PostgresRedoManager {
);

if let Err(e) = apply_result {
error!("could not apply WAL records: {:#}", e);
error!("could not apply WAL records: {}", e);
result = Err(WalRedoError::IoError(e));
} else {
let img = apply_result.unwrap();
@@ -439,8 +438,8 @@ impl PostgresRedoManager {
/// Handle to the Postgres WAL redo process
///
struct PostgresRedoProcess {
stdin: ChildStdin,
stdout: ChildStdout,
stdin: RefCell<ChildStdin>,
stdout: RefCell<ChildStdout>,
}

impl PostgresRedoProcess {
@@ -460,7 +459,7 @@ impl PostgresRedoProcess {
if datadir.exists() {
info!("directory {:?} exists, removing", &datadir);
if let Err(e) = fs::remove_dir_all(&datadir) {
error!("could not remove old wal-redo-datadir: {:#}", e);
error!("could not remove old wal-redo-datadir: {:?}", e);
}
}
info!("running initdb in {:?}", datadir.display());
@@ -533,7 +532,10 @@ impl PostgresRedoProcess {
};
tokio::spawn(f_stderr);

Ok(PostgresRedoProcess { stdin, stdout })
Ok(PostgresRedoProcess {
stdin: RefCell::new(stdin),
stdout: RefCell::new(stdout),
})
}

//
@@ -541,14 +543,13 @@ impl PostgresRedoProcess {
// new page image.
//
async fn apply_wal_records(
&mut self,
&self,
tag: BufferTag,
base_img: Option<Bytes>,
records: &[(Lsn, WALRecord)],
records: &[WALRecord],
) -> Result<Bytes, std::io::Error> {
let stdout = &mut self.stdout;
// Buffer the writes to avoid a lot of small syscalls.
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
let mut stdin = self.stdin.borrow_mut();
let mut stdout = self.stdout.borrow_mut();

// We do three things simultaneously: send the old base image and WAL records to
// the child process's stdin, read the result from child's stdout, and forward any logging
@@ -565,16 +566,22 @@ impl PostgresRedoProcess {
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
)
.await??;
if let Some(img) = base_img {
timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
if base_img.is_some() {
timeout(
TIMEOUT,
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
)
.await??;
}

// Send WAL records.
for (lsn, rec) in records.iter() {
for rec in records.iter() {
let r = rec.clone();

WAL_REDO_RECORD_COUNTER.inc();

stdin
.write_all(&build_apply_record_msg(*lsn, &rec.rec))
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;

//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
@@ -611,41 +618,58 @@ impl PostgresRedoProcess {
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
// explanation of the protocol.

fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
let len = 4 + 1 + 4 * 4;
let mut buf = Vec::with_capacity(1 + len);
let mut buf = BytesMut::with_capacity(1 + len);

buf.put_u8(b'B');
buf.put_u32(len as u32);

tag.ser_into(&mut buf)
// FIXME: this is a temporary hack that should go away when we refactor
// the postgres protocol serialization + handlers.
//
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
// not in the std library. To write to a BytesMut from a serde serializer,
// we need to either:
// - pre-allocate the required buffer space. This is annoying because we
// shouldn't care what the exact serialized size is-- that's the
// serializer's job.
// - Or, we need to create a temporary "writer" (which implements the
// `Write` trait). It's a bit awkward, because the writer consumes the
// underlying BytesMut, and we need to extract it later with
// `into_inner`.
let mut writer = buf.writer();
tag.ser_into(&mut writer)
.expect("serialize BufferTag should always succeed");
let buf = writer.into_inner();

debug_assert!(buf.len() == 1 + len);

buf
buf.freeze()
}

fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
assert!(base_img.len() == 8192);

let len = 4 + 1 + 4 * 4 + base_img.len();
let mut buf = Vec::with_capacity(1 + len);
let mut buf = BytesMut::with_capacity(1 + len);

buf.put_u8(b'P');
buf.put_u32(len as u32);
tag.ser_into(&mut buf)
let mut writer = buf.writer();
tag.ser_into(&mut writer)
.expect("serialize BufferTag should always succeed");
let mut buf = writer.into_inner();
buf.put(base_img);

debug_assert!(buf.len() == 1 + len);

buf
buf.freeze()
}

fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
let len = 4 + 8 + rec.len();
let mut buf: Vec<u8> = Vec::with_capacity(1 + len);
let mut buf = BytesMut::with_capacity(1 + len);

buf.put_u8(b'A');
buf.put_u32(len as u32);
@@ -654,19 +678,21 @@ fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {

debug_assert!(buf.len() == 1 + len);

buf
buf.freeze()
}

fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
fn build_get_page_msg(tag: BufferTag) -> Bytes {
let len = 4 + 1 + 4 * 4;
let mut buf = Vec::with_capacity(1 + len);
let mut buf = BytesMut::with_capacity(1 + len);

buf.put_u8(b'G');
buf.put_u32(len as u32);
tag.ser_into(&mut buf)
let mut writer = buf.writer();
tag.ser_into(&mut writer)
.expect("serialize BufferTag should always succeed");
let buf = writer.into_inner();

debug_assert!(buf.len() == 1 + len);

buf
buf.freeze()
}

@@ -329,12 +329,7 @@ pub fn main() {
}

impl XLogRecord {
pub fn from_slice(buf: &[u8]) -> XLogRecord {
use zenith_utils::bin_ser::LeSer;
XLogRecord::des(buf).unwrap()
}

pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
use zenith_utils::bin_ser::LeSer;
XLogRecord::des_from(&mut buf.reader()).unwrap()
}
@@ -382,12 +377,10 @@ impl CheckPoint {
Ok(CheckPoint::des(buf)?)
}

/// Update next XID based on provided new_xid and stored epoch.
/// Next XID should be greater than new_xid. This handles 32-bit
/// XID wraparound correctly.
///
/// Returns 'true' if the XID was updated.
pub fn update_next_xid(&mut self, xid: u32) -> bool {
// Update next XID based on provided new_xid and stored epoch.
// Next XID should be greater than new_xid.
// Also take in account 32-bit wrap-around.
pub fn update_next_xid(&mut self, xid: u32) {
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
let full_xid = self.nextXid.value;
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
@@ -398,14 +391,10 @@ impl CheckPoint {
// wrap-around
epoch += 1;
}
let nextXid = (epoch << 32) | new_xid as u64;

if nextXid != self.nextXid.value {
self.nextXid = FullTransactionId { value: nextXid };
return true;
}
self.nextXid = FullTransactionId {
value: (epoch << 32) | new_xid as u64,
};
}
false
}
}


@@ -18,6 +18,5 @@ tokio = "1.11"
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
clap = "2.33.0"
rustls = "0.19.1"
reqwest = { version = "0.11", features = ["blocking", "json"] }

zenith_utils = { path = "../zenith_utils" }

@@ -1,14 +1,17 @@
use anyhow::{bail, Context, Result};
use anyhow::{bail, Result};
use serde::{Deserialize, Serialize};
use std::net::{SocketAddr, ToSocketAddrs};
use std::{
collections::HashMap,
net::{IpAddr, SocketAddr},
};

pub struct CPlaneApi {
auth_endpoint: &'static str,
// address: SocketAddr,
}

#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize)]
pub struct DatabaseInfo {
pub host: String,
pub host: IpAddr, // TODO: allow host name here too
pub port: u16,
pub dbname: String,
pub user: String,
@@ -16,13 +19,8 @@ pub struct DatabaseInfo {
}

impl DatabaseInfo {
pub fn socket_addr(&self) -> Result<SocketAddr> {
let host_port = format!("{}:{}", self.host, self.port);
host_port
.to_socket_addrs()
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
.next()
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
pub fn socket_addr(&self) -> SocketAddr {
SocketAddr::new(self.host, self.port)
}

pub fn conn_string(&self) -> String {
@@ -33,35 +31,62 @@ impl DatabaseInfo {
}
}

// mock cplane api
impl CPlaneApi {
pub fn new(auth_endpoint: &'static str) -> CPlaneApi {
CPlaneApi { auth_endpoint }
pub fn new(_address: &SocketAddr) -> CPlaneApi {
CPlaneApi {
// address: address.clone(),
}
}

pub fn authenticate_proxy_request(
&self,
user: &str,
database: &str,
md5_response: &[u8],
salt: &[u8; 4],
) -> Result<DatabaseInfo> {
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
url.query_pairs_mut()
.append_pair("login", user)
.append_pair("database", database)
.append_pair("md5response", std::str::from_utf8(md5_response)?)
.append_pair("salt", &hex::encode(salt));
pub fn check_auth(&self, user: &str, md5_response: &[u8], salt: &[u8; 4]) -> Result<()> {
// passwords for both is "mypass"
let auth_map: HashMap<_, &str> = vec![
("stas@zenith", "716ee6e1c4a9364d66285452c47402b1"),
("stas2@zenith", "3996f75df64c16a8bfaf01301b61d582"),
]
.into_iter()
.collect();

println!("cplane request: {}", url.as_str());
let stored_hash = auth_map
.get(&user)
.ok_or_else(|| anyhow::Error::msg("user not found"))?;
let salted_stored_hash = format!(
"md5{:x}",
md5::compute([stored_hash.as_bytes(), salt].concat())
);

let resp = reqwest::blocking::get(url)?;
let received_hash = std::str::from_utf8(md5_response)?;

if resp.status().is_success() {
let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
println!("got conn info: #{:?}", conn_info);
Ok(conn_info)
println!(
"auth: {} rh={} sh={} ssh={} {:?}",
user, received_hash, stored_hash, salted_stored_hash, salt
);

if received_hash == salted_stored_hash {
Ok(())
} else {
bail!("Auth failed")
}
}

pub fn get_database_uri(&self, _user: &str, _database: &str) -> Result<DatabaseInfo> {
Ok(DatabaseInfo {
host: "127.0.0.1".parse()?,
port: 5432,
dbname: "stas".to_string(),
user: "stas".to_string(),
password: "mypass".to_string(),
})
}

// pub fn create_database(&self, _user: &String, _database: &String) -> Result<DatabaseInfo> {
// Ok(DatabaseInfo {
// host: "127.0.0.1".parse()?,
// port: 5432,
// dbname: "stas".to_string(),
// user: "stas".to_string(),
// password: "mypass".to_string(),
// })
// }
}

@@ -34,7 +34,7 @@ pub struct ProxyConf {
pub redirect_uri: String,

/// control plane address where we would check auth.
pub auth_endpoint: String,
pub cplane_address: SocketAddr,

pub ssl_config: Option<Arc<ServerConfig>>,
}
@@ -56,7 +56,8 @@ fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerCo

let key = {
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
let mut keys = pemfile::rsa_private_keys(&mut &key_bytes[..])
.or_else(|_| pemfile::pkcs8_private_keys(&mut &key_bytes[..]))
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
keys.pop().unwrap()
@@ -99,15 +100,7 @@ fn main() -> anyhow::Result<()> {
.long("uri")
.takes_value(true)
.help("redirect unauthenticated users to given uri")
.default_value("http://127.0.0.1:3000/psql_session/"),
)
.arg(
Arg::with_name("auth-endpoint")
.short("a")
.long("auth-endpoint")
.takes_value(true)
.help("redirect unauthenticated users to given uri")
.default_value("http://127.0.0.1:3000/authenticate_proxy_request/"),
.default_value("http://localhost:3000/psql_session/"),
)
.arg(
Arg::with_name("ssl-key")
@@ -129,7 +122,7 @@ fn main() -> anyhow::Result<()> {
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
cplane_address: "127.0.0.1:3000".parse()?,
ssl_config: configure_ssl(&arg_matches)?,
};
let state = ProxyState {

@@ -5,7 +5,7 @@ use std::{

use anyhow::bail;
use bytes::Bytes;
use serde::Deserialize;
use serde::{Deserialize, Serialize};
use zenith_utils::{
postgres_backend::{self, query_from_cstring, AuthType, PostgresBackend},
pq_proto::{BeMessage, SINGLE_COL_ROWDESC},
@@ -34,7 +34,7 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:

pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
let mut conn_handler = MgmtHandler { state };
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
pgbackend.run(&mut conn_handler)
}

@@ -49,7 +49,7 @@ struct MgmtHandler {
// "host": "127.0.0.1",
// "port": 5432,
// "dbname": "stas",
// "user": "stas",
// "user": "stas"
// "password": "mypass"
// }
// }
@@ -60,16 +60,13 @@ struct MgmtHandler {
// "Failure": "oops"
// }
// }
//
// // to test manually by sending a query to mgmt interface:
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
#[derive(Deserialize)]
#[derive(Serialize, Deserialize)]
pub struct PsqlSessionResponse {
session_id: String,
result: PsqlSessionResult,
}

#[derive(Deserialize)]
#[derive(Serialize, Deserialize)]
pub enum PsqlSessionResult {
Success(DatabaseInfo),
Failure(String),
@@ -81,47 +78,34 @@ impl postgres_backend::Handler for MgmtHandler {
pgb: &mut PostgresBackend,
query_string: Bytes,
) -> anyhow::Result<()> {
let res = try_process_query(self, pgb, query_string);
// intercept and log error message
if res.is_err() {
println!("Mgmt query failed: #{:?}", res);
}
res
}
}
let query_string = query_from_cstring(query_string);

fn try_process_query(
mgmt: &mut MgmtHandler,
pgb: &mut PostgresBackend,
query_string: Bytes,
) -> anyhow::Result<()> {
let query_string = query_from_cstring(query_string);
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);

println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;

let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
let waiters = self.state.waiters.lock().unwrap();

let waiters = mgmt.state.waiters.lock().unwrap();
let sender = waiters
.get(&resp.session_id)
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;

let sender = waiters
.get(&resp.session_id)
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
match resp.result {
PsqlSessionResult::Success(db_info) => {
sender.send(Ok(db_info))?;

match resp.result {
PsqlSessionResult::Success(db_info) => {
sender.send(Ok(db_info))?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
pgb.flush()?;
Ok(())
}

pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
pgb.flush()?;
Ok(())
}
PsqlSessionResult::Failure(message) => {
sender.send(Err(anyhow::Error::msg(message.clone())))?;

PsqlSessionResult::Failure(message) => {
sender.send(Err(anyhow::Error::msg(message.clone())))?;

bail!("psql session request failed: {}", message)
bail!("psql session request failed: {}", message)
}
}
}
}

@@ -57,14 +57,13 @@ pub fn proxy_conn_main(
) -> anyhow::Result<()> {
let mut conn = ProxyConnection {
state,
cplane: CPlaneApi::new(&state.conf.auth_endpoint),
cplane: CPlaneApi::new(&state.conf.cplane_address),
user: "".into(),
database: "".into(),
pgb: PostgresBackend::new(
socket,
postgres_backend::AuthType::MD5,
state.conf.ssl_config.clone(),
false,
)?,
md5_salt: [0u8; 4],
psql_session_id: "".into(),
@@ -81,8 +80,6 @@ pub fn proxy_conn_main(
conn.handle_new_user()?
};

// XXX: move that inside handle_new_user/handle_existing_user to be able to
// report wrong connection error.
proxy_pass(conn.pgb, db_info)
}

@@ -175,31 +172,21 @@ impl ProxyConnection {
.split_last()
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;

match self.cplane.authenticate_proxy_request(
self.user.as_str(),
self.database.as_str(),
md5_response,
&self.md5_salt,
) {
Err(e) => {
self.pgb
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;

bail!("auth failed: {}", e);
}
Ok(conn_info) => {
self.pgb
.write_message_noflush(&BeMessage::AuthenticationOk)?;
self.pgb
.write_message_noflush(&BeMessage::ParameterStatus)?;
self.pgb.write_message(&BeMessage::ReadyForQuery)?;

Ok(conn_info)
}
if let Err(e) = self.check_auth_md5(md5_response) {
self.pgb
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
bail!("auth failed: {}", e);
} else {
self.pgb
.write_message_noflush(&BeMessage::AuthenticationOk)?;
self.pgb
.write_message_noflush(&BeMessage::ParameterStatus)?;
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
}
} else {
bail!("protocol violation");
}

// ok, we are authorized
self.cplane.get_database_uri(&self.user, &self.database)
}

fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
@@ -245,11 +232,17 @@ databases without opening the browser.

Ok(dbinfo)
}

fn check_auth_md5(&self, md5_response: &[u8]) -> anyhow::Result<()> {
assert!(self.is_existing_user());
self.cplane
.check_auth(self.user.as_str(), md5_response, &self.md5_salt)
}
}

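The md5_response that check_auth_md5 forwards to the control plane follows the standard Postgres scheme: md5(md5(password + username) + salt), hex-encoded and prefixed with "md5". A sketch of how a conforming client computes it (the helper name is ours):

import hashlib

def md5_response(user: str, password: str, salt: bytes) -> bytes:
    # inner digest binds the password to the user name,
    # outer digest binds the result to the per-connection salt
    inner = hashlib.md5(password.encode() + user.encode()).hexdigest()
    return b"md5" + hashlib.md5(inner.encode() + salt).hexdigest().encode()
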
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
let _ = config.connect_raw(&mut socket, NoTls).await?;
Ok(socket)
@@ -280,9 +273,7 @@ fn proxy(

/// Proxy a client connection to a postgres database
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let runtime = tokio::runtime::Builder::new_current_thread().build()?;
let db_stream = runtime.block_on(connect_to_db(db_info))?;
let db_stream = db_stream.into_std()?;
db_stream.set_nonblocking(false)?;

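Once connect_to_db has finished the handshake, proxy_pass only needs to shuttle bytes between the client and database sockets; the stream is put back into blocking mode because the copy loop is plain threaded I/O. A rough Python equivalent of such a pump, as a sketch only:

import socket
import threading

def pump(src: socket.socket, dst: socket.socket) -> None:
    # copy one direction until EOF, then half-close the destination
    while chunk := src.recv(8192):
        dst.sendall(chunk)
    dst.shutdown(socket.SHUT_WR)

def proxy_pass(client: socket.socket, db: socket.socket) -> None:
    t = threading.Thread(target=pump, args=(db, client), daemon=True)
    t.start()
    pump(client, db)  # client -> database runs on the current thread
    t.join()
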
@@ -1,6 +1,6 @@
|
||||
import subprocess
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -13,7 +13,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"])
|
||||
|
||||
pgmain = postgres.create_start('test_branch_behind')
|
||||
log.info("postgres is running on 'test_branch_behind' branch")
|
||||
print("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
@@ -27,17 +27,17 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
log.info(f'LSN after 100 rows: {lsn_a}')
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 200000) g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
log.info(f'LSN after 200100 rows: {lsn_b}')
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
@@ -46,15 +46,15 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 200000) g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
log.info(f'LSN after 400100 rows: {lsn_c}')
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
|
||||
# Branch at the point where only 200100 rows were inserted
|
||||
# Branch at the point where only 200 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
@@ -70,11 +70,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
more_pg_conn = pg_more.connect()
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (200100, )
|
||||
assert more_cur.fetchone() == (100100, )
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (400100, )
|
||||
assert main_cur.fetchone() == (200100, )
|
||||
|
||||
# Check bad lsn's for branching
|
||||
|
||||
@@ -89,4 +89,4 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
try:
|
||||
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
|
||||
except subprocess.CalledProcessError:
|
||||
log.info("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
print("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
|
||||
@@ -4,7 +4,6 @@ import os
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -25,7 +24,7 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
]
|
||||
|
||||
pg = postgres.create_start('test_clog_truncate', config_lines=config)
|
||||
log.info('postgres is running on test_clog_truncate branch')
|
||||
print('postgres is running on test_clog_truncate branch')
|
||||
|
||||
# Install extension containing function needed for test
|
||||
pg.safe_psql('CREATE EXTENSION zenith_test_utils')
|
||||
@@ -34,22 +33,22 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('select test_consume_xids(1000*1000*10);')
|
||||
log.info('xids consumed')
|
||||
print('xids consumed')
|
||||
|
||||
# call a checkpoint to trigger TruncateSubtrans
|
||||
cur.execute('CHECKPOINT;')
|
||||
|
||||
# ensure WAL flush
|
||||
cur.execute('select txid_current()')
|
||||
log.info(cur.fetchone())
|
||||
print(cur.fetchone())
|
||||
|
||||
# wait for autovacuum to truncate the pg_xact
|
||||
# XXX Is it worth to add a timeout here?
|
||||
pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), '0000')
|
||||
log.info(f"pg_xact_0000_path = {pg_xact_0000_path}")
|
||||
print("pg_xact_0000_path = " + pg_xact_0000_path)
|
||||
|
||||
while os.path.isfile(pg_xact_0000_path):
|
||||
log.info(f"file exists. wait for truncation. " "pg_xact_0000_path = {pg_xact_0000_path}")
|
||||
print("file exists. wait for truncation. " "pg_xact_0000_path = " + pg_xact_0000_path)
|
||||
time.sleep(5)
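Answering the XXX above: a bounded variant of this wait would fail the test instead of hanging it if autovacuum never truncates. A sketch, with an arbitrary timeout:

import os
import time

def wait_until_gone(path: str, timeout: float = 300.0, poll: float = 5.0) -> None:
    """Poll until the file disappears, or raise after `timeout` seconds."""
    deadline = time.time() + timeout
    while os.path.isfile(path):
        if time.time() >= deadline:
            raise TimeoutError(f"{path} still exists after {timeout}s")
        time.sleep(poll)
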
|
||||
|
||||
# checkpoint to advance latest lsn
|
||||
@@ -60,14 +59,14 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
lsn_after_truncation = cur.fetchone()[0]
|
||||
|
||||
# create new branch after clog truncation and start a compute node on it
|
||||
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
|
||||
print('create branch at lsn_after_truncation ' + lsn_after_truncation)
|
||||
zenith_cli.run(
|
||||
["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])
|
||||
|
||||
pg2 = postgres.create_start('test_clog_truncate_new')
|
||||
log.info('postgres is running on test_clog_truncate_new branch')
|
||||
print('postgres is running on test_clog_truncate_new branch')
|
||||
|
||||
# check that new node doesn't contain truncated segment
|
||||
pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000')
|
||||
log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}")
|
||||
print("pg_xact_0000_path_new = " + pg_xact_0000_path_new)
|
||||
assert os.path.isfile(pg_xact_0000_path_new) is False
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -15,7 +14,7 @@ def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFact
|
||||
|
||||
# change config
|
||||
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
|
||||
log.info('postgres is running on test_config branch')
|
||||
print('postgres is running on test_config branch')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -3,7 +3,6 @@ import pathlib
|
||||
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -20,7 +19,7 @@ def test_createdb(
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createdb')
|
||||
log.info("postgres is running on 'test_createdb' branch")
|
||||
print("postgres is running on 'test_createdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -54,7 +53,7 @@ def test_dropdb(
|
||||
zenith_cli.run(["branch", "test_dropdb", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_dropdb')
|
||||
log.info("postgres is running on 'test_dropdb' branch")
|
||||
print("postgres is running on 'test_dropdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -89,13 +88,13 @@ def test_dropdb(
|
||||
|
||||
# Test that database subdir exists on the branch before drop
|
||||
dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
|
||||
log.info(dbpath)
|
||||
print(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == True
|
||||
|
||||
# Test that database subdir doesn't exist on the branch after drop
|
||||
dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
|
||||
log.info(dbpath)
|
||||
print(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == False
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -13,7 +12,7 @@ def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: Postgres
|
||||
zenith_cli.run(["branch", "test_createuser", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createuser')
|
||||
log.info("postgres is running on 'test_createuser' branch")
|
||||
print("postgres is running on 'test_createuser' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -16,7 +15,7 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
zenith_cli.run(["branch", "test_multixact", "empty"])
|
||||
pg = postgres.create_start('test_multixact')
|
||||
|
||||
log.info("postgres is running on 'test_multixact' branch")
|
||||
print("postgres is running on 'test_multixact' branch")
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
@@ -56,7 +55,7 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
|
||||
pg_new = postgres.create_start('test_multixact_new')
|
||||
|
||||
log.info("postgres is running on 'test_multixact_new' branch")
|
||||
print("postgres is running on 'test_multixact_new' branch")
|
||||
pg_new_conn = pg_new.connect()
|
||||
cur_new = pg_new_conn.cursor()
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -19,7 +18,7 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
|
||||
pg = postgres.create_start('test_old_request_lsn')
|
||||
log.info('postgres is running on test_old_request_lsn branch')
|
||||
print('postgres is running on test_old_request_lsn branch')
|
||||
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
@@ -47,7 +46,7 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
log.info(f'shared_buffers is {row[0]}, table size {row[1]}');
|
||||
print("shared_buffers is {}, table size {}", row[0], row[1]);
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
cur.execute('VACUUM foo');
|
||||
|
||||
@@ -5,7 +5,6 @@ import time
|
||||
from contextlib import closing
|
||||
from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -41,7 +40,7 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres:
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}");
|
||||
print("shared_buffers is {}, table size {}", row[0], row[1]);
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
# Stop and restart pageserver. This is a more or less graceful shutdown, although
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -9,9 +8,9 @@ def test_pgbench(postgres: PostgresFactory, pg_bin, zenith_cli):
|
||||
zenith_cli.run(["branch", "test_pgbench", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_pgbench')
|
||||
log.info("postgres is running on 'test_pgbench' branch")
|
||||
print("postgres is running on 'test_pgbench' branch")
|
||||
|
||||
connstr = pg.connstr()
|
||||
|
||||
pg_bin.run_capture(['pgbench', '-i', connstr])
|
||||
pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr])
|
||||
pg_bin.run_capture(['pgbench', '-i', '-s', '100', connstr])
|
||||
pg_bin.run_capture(['pgbench'] + '-c 1 -N -T 100 -P 1 -M prepared'.split() + [connstr])
|
||||
|
||||
@@ -2,7 +2,6 @@ import pytest
|
||||
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -10,7 +9,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Test restarting and recreating a postgres instance
|
||||
#
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
|
||||
# XXX: with_wal_acceptors=True fails now; it will be fixed by the
# `postgres --sync-safekeepers` patches.
|
||||
#
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False])
|
||||
def test_restart_compute(
|
||||
zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
@@ -28,7 +30,7 @@ def test_restart_compute(
|
||||
|
||||
pg = postgres.create_start('test_restart_compute',
|
||||
wal_acceptors=wal_acceptor_connstrs)
|
||||
log.info("postgres is running on 'test_restart_compute' branch")
|
||||
print("postgres is running on 'test_restart_compute' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -37,7 +39,7 @@ def test_restart_compute(
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
r = cur.fetchone()
|
||||
assert r == (5000050000, )
|
||||
log.info(f"res = {r}")
|
||||
print("res = ", r)
|
||||
|
||||
# Remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
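As a sanity check on the value asserted above: the keys are 1..100000, and the closed form n(n+1)/2 gives exactly the number the test expects.

n = 100_000
assert n * (n + 1) // 2 == 5_000_050_000
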
|
||||
@@ -50,7 +52,7 @@ def test_restart_compute(
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
r = cur.fetchone()
|
||||
assert r == (5000050000, )
|
||||
log.info(f"res = {r}")
|
||||
print("res = ", r)
|
||||
|
||||
# Insert another row
|
||||
cur.execute("INSERT INTO t VALUES (100001, 'payload2')")
|
||||
@@ -58,7 +60,7 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
log.info(f"res = {r}")
|
||||
print("res = ", r)
|
||||
|
||||
# Again remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
|
||||
@@ -73,7 +75,7 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
log.info(f"res = {r}")
|
||||
print("res = ", r)
|
||||
|
||||
# And again remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
|
||||
@@ -86,4 +88,4 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
log.info(f"res = {r}")
|
||||
print("res = ", r)
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
from contextlib import closing
|
||||
import psycopg2.extras
|
||||
import time
|
||||
from fixtures.log_helper import log
|
||||
import time;
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def print_gc_result(row):
|
||||
log.info("GC duration {elapsed} ms".format_map(row));
|
||||
log.info(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
|
||||
log.info(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
|
||||
print("GC duration {elapsed} ms".format_map(row));
|
||||
print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
|
||||
print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
|
||||
|
||||
|
||||
#
|
||||
@@ -36,7 +35,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
|
||||
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
|
||||
row = cur.fetchone();
|
||||
log.info(f"relfilenode is {row[0]}");
|
||||
print("relfilenode is {}", row[0]);
|
||||
|
||||
# Run GC, to clear out any garbage left behind in the catalogs by
|
||||
# the CREATE TABLE command. We want to have a clean slate with no garbage
|
||||
@@ -51,7 +50,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# update to confuse our numbers either.
|
||||
cur.execute("DELETE FROM foo")
|
||||
|
||||
log.info("Running GC before test")
|
||||
print("Running GC before test")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
@@ -59,46 +58,44 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed']
|
||||
assert layer_relfiles_remain > 0
|
||||
|
||||
# Insert a row and run GC. Checkpoint should freeze the layer
|
||||
# so that there is only the most recent image layer left for the rel,
|
||||
# removing the old image and delta layer.
|
||||
log.info("Inserting one row and running GC")
|
||||
# Insert a row.
|
||||
print("Inserting one row and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (1)")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Insert two more rows and run GC.
|
||||
# This should create new image and delta layer file with the new contents, and
|
||||
# then remove the old one image and the just-created delta layer.
|
||||
log.info("Inserting two more rows and running GC")
|
||||
# This should create a new layer file with the new contents, and
|
||||
# remove the old one.
|
||||
print("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Do it again. Should again create two new layer files and remove old ones.
|
||||
log.info("Inserting two more rows and running GC")
|
||||
# Do it again. Should again create a new layer file and remove old one.
|
||||
print("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
|
||||
assert row['layer_relfiles_removed'] == 1
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Run GC again, with no changes in the database. Should not remove anything.
|
||||
log.info("Run GC again, with nothing to do")
|
||||
print("Run GC again, with nothing to do")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
@@ -109,26 +106,19 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
#
|
||||
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
|
||||
#
|
||||
log.info("Drop table and run GC again");
|
||||
print("Drop table and run GC again");
|
||||
cur.execute("DROP TABLE foo")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
|
||||
# We still cannot remove the latest layers
|
||||
# because they serve as tombstones for earlier layers.
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
assert row['layer_relfiles_needed_as_tombstone'] == 3
|
||||
assert row['layer_relfiles_dropped'] == 3
|
||||
|
||||
# The catalog updates also create new layer files of the catalogs, which
|
||||
# are counted as 'removed'
|
||||
assert row['layer_relfiles_removed'] > 0
|
||||
|
||||
# TODO Change the test to check actual GC of dropped layers.
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
#assert row['layer_relfiles_dropped'] == 3
|
||||
|
||||
# TODO: perhaps we should count catalog and user relations separately,
|
||||
# to make this kind of testing more robust
|
||||
|
||||
@@ -2,7 +2,7 @@ from contextlib import closing
|
||||
from uuid import UUID
|
||||
import psycopg2.extras
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
def test_timeline_size(
|
||||
zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin
|
||||
@@ -15,7 +15,7 @@ def test_timeline_size(
|
||||
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
|
||||
|
||||
pgmain = postgres.create_start("test_timeline_size")
|
||||
log.info("postgres is running on 'test_timeline_size' branch")
|
||||
print("postgres is running on 'test_timeline_size' branch")
|
||||
|
||||
with closing(pgmain.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, PgBin
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -13,7 +13,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
zenith_cli.run(["branch", "test_twophase", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
|
||||
log.info("postgres is running on 'test_twophase' branch")
|
||||
print("postgres is running on 'test_twophase' branch")
|
||||
|
||||
conn = pg.connect()
|
||||
cur = conn.cursor()
|
||||
@@ -45,7 +45,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
cur.execute('CHECKPOINT')
|
||||
|
||||
twophase_files = os.listdir(pg.pg_twophase_dir_path())
|
||||
log.info(twophase_files)
|
||||
print(twophase_files)
|
||||
assert len(twophase_files) == 4
|
||||
|
||||
cur.execute("COMMIT PREPARED 'insert_three'")
|
||||
@@ -53,7 +53,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
cur.execute('CHECKPOINT')
|
||||
|
||||
twophase_files = os.listdir(pg.pg_twophase_dir_path())
|
||||
log.info(twophase_files)
|
||||
print(twophase_files)
|
||||
assert len(twophase_files) == 2
|
||||
|
||||
# Create a branch with the transaction in prepared state
|
||||
@@ -67,7 +67,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
|
||||
# Check that we restored only needed twophase files
|
||||
twophase_files2 = os.listdir(pg2.pg_twophase_dir_path())
|
||||
log.info(twophase_files2)
|
||||
print(twophase_files2)
|
||||
assert sorted(twophase_files2) == sorted(twophase_files)  # list.sort() returns None, so comparing sort() results always passed
|
||||
|
||||
conn2 = pg2.connect()
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -12,7 +11,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
|
||||
zenith_cli.run(["branch", "test_vm_bit_clear", "empty"])
|
||||
pg = postgres.create_start('test_vm_bit_clear')
|
||||
|
||||
log.info("postgres is running on 'test_vm_bit_clear' branch")
|
||||
print("postgres is running on 'test_vm_bit_clear' branch")
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
@@ -64,7 +63,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
|
||||
# server at the right point-in-time avoids that full-page image.
|
||||
pg_new = postgres.create_start('test_vm_bit_clear_new')
|
||||
|
||||
log.info("postgres is running on 'test_vm_bit_clear_new' branch")
|
||||
print("postgres is running on 'test_vm_bit_clear_new' branch")
|
||||
pg_new_conn = pg_new.connect()
|
||||
cur_new = pg_new_conn.cursor()
|
||||
|
||||
|
||||
@@ -1,15 +1,10 @@
|
||||
import pytest
|
||||
import random
|
||||
import time
|
||||
import os
|
||||
import subprocess
|
||||
import uuid
|
||||
|
||||
from contextlib import closing
|
||||
from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin
|
||||
from fixtures.utils import lsn_to_hex, mkdir_if_needed
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -203,121 +198,3 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
|
||||
|
||||
stop_value.value = 1
|
||||
proc.join()
|
||||
|
||||
|
||||
class ProposerPostgres:
|
||||
"""Object for running safekeepers sync with walproposer"""
|
||||
def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str):
|
||||
self.pgdata_dir: str = pgdata_dir
|
||||
self.pg_bin: PgBin = pg_bin
|
||||
self.timeline_id: str = timeline_id
|
||||
self.tenant_id: str = tenant_id
|
||||
|
||||
def pg_data_dir_path(self) -> str:
|
||||
""" Path to data directory """
|
||||
return self.pgdata_dir
|
||||
|
||||
def config_file_path(self) -> str:
|
||||
""" Path to postgresql.conf """
|
||||
return os.path.join(self.pgdata_dir, 'postgresql.conf')
|
||||
|
||||
def create_dir_config(self, wal_acceptors: str):
|
||||
""" Create dir and config for running --sync-safekeepers """
|
||||
|
||||
mkdir_if_needed(self.pg_data_dir_path())
|
||||
with open(self.config_file_path(), "w") as f:
|
||||
f.writelines([
|
||||
"synchronous_standby_names = 'walproposer'\n",
|
||||
f"zenith.zenith_timeline = '{self.timeline_id}'\n",
|
||||
f"zenith.zenith_tenant = '{self.tenant_id}'\n",
|
||||
f"wal_acceptors = '{wal_acceptors}'\n",
|
||||
])
|
||||
|
||||
def sync_safekeepers(self) -> str:
|
||||
"""
|
||||
Run 'postgres --sync-safekeepers'.
|
||||
Returns execution result, which is commit_lsn after sync.
|
||||
"""
|
||||
|
||||
command = ["postgres", "--sync-safekeepers"]
|
||||
env = {
|
||||
"PGDATA": self.pg_data_dir_path(),
|
||||
}
|
||||
|
||||
basepath = self.pg_bin.run_capture(command, env)
|
||||
stdout_filename = basepath + '.stdout'
|
||||
|
||||
with open(stdout_filename, 'r') as stdout_f:
|
||||
stdout = stdout_f.read()
|
||||
return stdout.strip("\n ")
|
||||
|
||||
|
||||
# insert wal in all safekeepers and run sync on proposer
|
||||
def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorFactory):
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
timeline_id = uuid.uuid4().hex
|
||||
tenant_id = uuid.uuid4().hex
|
||||
|
||||
# write config for proposer
|
||||
pgdata_dir = os.path.join(repo_dir, "proposer_pgdata")
|
||||
pg = ProposerPostgres(pgdata_dir, pg_bin, timeline_id, tenant_id)
|
||||
pg.create_dir_config(wa_factory.get_connstrs())
|
||||
|
||||
# a valid lsn which is neither at a segment start nor in the zero segment
|
||||
epoch_start_lsn = 0x16B9188 # 0/16B9188
|
||||
begin_lsn = epoch_start_lsn
|
||||
|
||||
# append and commit WAL
|
||||
lsn_after_append = []
|
||||
for i in range(3):
|
||||
res = wa_factory.instances[i].append_logical_message(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
{
|
||||
"lm_prefix": "prefix",
|
||||
"lm_message": "message",
|
||||
"set_commit_lsn": True,
|
||||
"term": 2,
|
||||
"begin_lsn": begin_lsn,
|
||||
"epoch_start_lsn": epoch_start_lsn,
|
||||
"truncate_lsn": epoch_start_lsn,
|
||||
},
|
||||
)
|
||||
lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"])
|
||||
lsn_after_append.append(lsn_hex)
|
||||
log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}")
|
||||
|
||||
# run sync safekeepers
|
||||
lsn_after_sync = pg.sync_safekeepers()
|
||||
log.info(f"lsn after sync = {lsn_after_sync}")
|
||||
|
||||
assert all(lsn_after_sync == lsn for lsn in lsn_after_append)
|
||||
|
||||
|
||||
def test_timeline_status(zenith_cli, pageserver, postgres, wa_factory: WalAcceptorFactory):
|
||||
wa_factory.start_n_new(1)
|
||||
|
||||
zenith_cli.run(["branch", "test_timeline_status", "empty"])
|
||||
pg = postgres.create_start('test_timeline_status', wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
wa = wa_factory.instances[0]
|
||||
wa_http_cli = wa.http_client()
|
||||
wa_http_cli.check_status()
|
||||
|
||||
# learn zenith timeline from compute
|
||||
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
|
||||
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
|
||||
|
||||
# fetch something sensible from status
|
||||
epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch
|
||||
|
||||
pg.safe_psql("create table t(i int)")
|
||||
|
||||
# ensure epoch goes up after reboot
|
||||
pg.stop().start()
|
||||
pg.safe_psql("insert into t values(10)")
|
||||
|
||||
epoch_after_reboot = wa_http_cli.timeline_status(tenant_id,
|
||||
timeline_id).acceptor_epoch
|
||||
assert epoch_after_reboot > epoch
|
||||
|
||||
@@ -3,10 +3,9 @@ import asyncpg
|
||||
import random
|
||||
|
||||
from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres
|
||||
from fixtures.log_helper import getLogger
|
||||
from typing import List
|
||||
from fixtures.utils import debug_print
|
||||
|
||||
log = getLogger('root.wal_acceptor_async')
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -64,18 +63,18 @@ class WorkerStats(object):
|
||||
self.counters[worker_id] += 1
|
||||
|
||||
def check_progress(self):
|
||||
log.debug("Workers progress: {}".format(self.counters))
|
||||
debug_print("Workers progress: {}".format(self.counters))
|
||||
|
||||
# every worker should finish at least one tx
|
||||
assert all(cnt > 0 for cnt in self.counters)
|
||||
|
||||
progress = sum(self.counters)
|
||||
log.info('All workers made {} transactions'.format(progress))
|
||||
print('All workers made {} transactions'.format(progress))
|
||||
|
||||
|
||||
async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer):
|
||||
pg_conn = await pg.connect_async()
|
||||
log.debug('Started worker {}'.format(worker_id))
|
||||
debug_print('Started worker {}'.format(worker_id))
|
||||
|
||||
while stats.running:
|
||||
from_uid = random.randint(0, n_accounts - 1)
|
||||
@@ -85,9 +84,9 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou
|
||||
await bank_transfer(pg_conn, from_uid, to_uid, amount)
|
||||
stats.inc_progress(worker_id)
|
||||
|
||||
log.debug('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid))
|
||||
debug_print('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid))
|
||||
|
||||
log.debug('Finished worker {}'.format(worker_id))
|
||||
debug_print('Finished worker {}'.format(worker_id))
|
||||
|
||||
await pg_conn.close()
|
||||
|
||||
@@ -119,14 +118,9 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
|
||||
victim = acceptors[it % len(acceptors)]
|
||||
victim.stop()
|
||||
|
||||
# Wait till the previous victim recovers so it is ready for the next
# iteration, by executing a write transaction against it.
|
||||
conn = await pg.connect_async()
|
||||
await conn.execute(
|
||||
'UPDATE bank_accs SET amount = amount WHERE uid = 1',
|
||||
timeout=120
|
||||
)
|
||||
await conn.close()
|
||||
# wait for transactions that could have started and finished before
|
||||
# victim acceptor was stopped
|
||||
await asyncio.sleep(1)
|
||||
|
||||
stats.reset()
|
||||
await asyncio.sleep(period_time)
|
||||
@@ -135,7 +129,7 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
|
||||
|
||||
victim.start()
|
||||
|
||||
log.info('Iterations are finished, exiting coroutines...')
|
||||
print('Iterations are finished, exiting coroutines...')
|
||||
stats.running = False
|
||||
# await all workers
|
||||
await asyncio.gather(*workers)
|
||||
|
||||
@@ -2,7 +2,6 @@ import os
|
||||
|
||||
from fixtures.utils import mkdir_if_needed
|
||||
from fixtures.zenith_fixtures import PageserverPort, PostgresFactory, check_restored_datadir_content
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -39,7 +38,7 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
|
||||
'--inputdir={}'.format(src_path),
|
||||
]
|
||||
|
||||
log.info(pg_regress_command)
|
||||
print(pg_regress_command)
|
||||
env = {
|
||||
'PGPORT': str(pg.port),
|
||||
'PGUSER': pg.username,
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from pprint import pprint
|
||||
|
||||
import os
|
||||
import re
|
||||
import timeit
|
||||
@@ -134,34 +136,9 @@ class ZenithBenchmarker:
|
||||
# The metric should be an integer, as it's a number of bytes. But in general
|
||||
# all prometheus metrics are floats. So to be pedantic, read it as a float
|
||||
# and round to integer.
|
||||
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', all_metrics,
|
||||
re.MULTILINE)
|
||||
matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
|
||||
return int(round(float(matches.group(1))))
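A tiny illustration of that parse, with a made-up sample line:

import re

sample = 'pageserver_disk_io_bytes{io_operation="write"} 1.048576e+06'
m = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', sample, re.MULTILINE)
assert int(round(float(m.group(1)))) == 1048576
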
|
||||
|
||||
def get_peak_mem(self, pageserver) -> int:
|
||||
"""
|
||||
Fetch the "maxrss" metric from the pageserver
|
||||
"""
|
||||
# Fetch all the exposed prometheus metrics from page server
|
||||
all_metrics = pageserver.http_client().get_metrics()
|
||||
# See comment in get_io_writes()
|
||||
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics,
|
||||
re.MULTILINE)
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):
|
||||
"""
|
||||
Calculate the on-disk size of a timeline
|
||||
"""
|
||||
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
|
||||
|
||||
totalbytes = 0
|
||||
for root, dirs, files in os.walk(path):
|
||||
for name in files:
|
||||
totalbytes += os.path.getsize(os.path.join(root, name))
|
||||
|
||||
return totalbytes
|
||||
|
||||
@contextmanager
|
||||
def record_pageserver_writes(self, pageserver, metric_name):
|
||||
"""
|
||||
|
||||
@@ -1,44 +0,0 @@
import logging
import logging.config

"""
This file configures logging to use in python tests.
Logs are automatically captured and shown in their
own section after all tests are executed.

To see logs for all (even successful) tests, run
pytest with the following command:
- `pipenv run pytest -n8 -rA`

Other log config can be set in pytest.ini file.
You can add `log_cli = true` to it to watch
logs in real time.

To get more info about logging with pytest, see
https://docs.pytest.org/en/6.2.x/logging.html
"""

# this config is only used for default log levels,
# log format is specified in pytest.ini file
LOGGING = {
"version": 1,
"loggers": {
"root": {
"level": "INFO"
},
"root.wal_acceptor_async": {
"level": "INFO" # a lot of logs on DEBUG level
}
}
}

def getLogger(name='root') -> logging.Logger:
"""Method to get logger for tests.

Should be used to get correctly initialized logger. """
return logging.getLogger(name)

# default logger for tests
log = getLogger()

logging.config.dictConfig(LOGGING)
@@ -2,7 +2,7 @@ import os
|
||||
import subprocess
|
||||
|
||||
from typing import Any, List
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
def get_self_dir() -> str:
|
||||
""" Get the path to the directory where this script lives. """
|
||||
@@ -21,7 +21,7 @@ def mkdir_if_needed(path: str) -> None:
|
||||
assert os.path.isdir(path)
|
||||
|
||||
|
||||
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
|
||||
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
|
||||
""" Run a process and capture its output
|
||||
|
||||
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
@@ -29,7 +29,6 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
|
||||
counter.
|
||||
|
||||
If those files already exist, we will overwrite them.
|
||||
Returns basepath for files with captured output.
|
||||
"""
|
||||
assert type(cmd) is list
|
||||
base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
|
||||
@@ -39,11 +38,9 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
|
||||
|
||||
with open(stdout_filename, 'w') as stdout_f:
|
||||
with open(stderr_filename, 'w') as stderr_f:
|
||||
log.info('(capturing output to "{}.stdout")'.format(base))
|
||||
print('(capturing output to "{}.stdout")'.format(base))
|
||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||
|
||||
return basepath
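For example (path and counter value hypothetical), a call like this leaves a pair of files in the capture directory and returns their shared prefix:

base = subprocess_capture('/tmp/test_output', ['pg_regress', '--help'])
# base == '/tmp/test_output/pg_regress_1'
# output lands in pg_regress_1.stdout and pg_regress_1.stderr
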
|
||||
|
||||
|
||||
_global_counter = 0
|
||||
|
||||
@@ -58,6 +55,10 @@ def global_counter() -> int:
|
||||
_global_counter += 1
|
||||
return _global_counter
|
||||
|
||||
def lsn_to_hex(num: int) -> str:
|
||||
""" Convert lsn from int to standard hex notation. """
|
||||
return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
|
||||
def debug_print(*args, **kwargs) -> None:
|
||||
""" Print to the console if TEST_DEBUG_PRINT is set in env.
|
||||
|
||||
All parameters are passed to print().
|
||||
"""
|
||||
if os.environ.get('TEST_DEBUG_PRINT') is not None:
|
||||
print(*args, **kwargs)
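A quick usage sketch: the environment variable is checked at call time, so the gate can be flipped within a single run.

import os

os.environ['TEST_DEBUG_PRINT'] = '1'
debug_print('Workers progress:', [3, 1, 2])  # printed
del os.environ['TEST_DEBUG_PRINT']
debug_print('now suppressed')                # silent
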
|
||||
|
||||
@@ -5,7 +5,6 @@ import os
|
||||
import pathlib
|
||||
import uuid
|
||||
import jwt
|
||||
import json
|
||||
import psycopg2
|
||||
import pytest
|
||||
import shutil
|
||||
@@ -27,8 +26,6 @@ from typing_extensions import Literal
|
||||
import requests
|
||||
|
||||
from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)
|
||||
from fixtures.log_helper import log
|
||||
|
||||
"""
|
||||
This file contains pytest fixtures. A fixture is a test resource that can be
|
||||
summoned by placing its name in the test's arguments.
|
||||
@@ -66,16 +63,14 @@ def pytest_configure(config):
|
||||
raise Exception('Too many workers configured. Cannot distribute ports for services.')
|
||||
|
||||
# does not use -c as it is not supported on macOS
|
||||
cmd = ['pgrep', 'pageserver|postgres|safekeeper']
|
||||
cmd = ['pgrep', 'pageserver|postgres|wal_acceptor']
|
||||
result = subprocess.run(cmd, stdout=subprocess.DEVNULL)
|
||||
if result.returncode == 0:
|
||||
# returncode of 0 means it found something.
|
||||
# This is bad; we don't want any of those processes polluting the
|
||||
# result of the test.
|
||||
# NOTE this shows as an internal pytest error, there might be a better way
|
||||
raise Exception(
|
||||
'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.'
|
||||
)
|
||||
raise Exception('found interfering processes running')
|
||||
|
||||
|
||||
def determine_scope(fixture_name: str, config: Any) -> str:
|
||||
@@ -190,13 +185,13 @@ class ZenithCli:
|
||||
|
||||
>>> result = zenith_cli.run(...)
|
||||
>>> assert result.stderr == ""
|
||||
>>> log.info(result.stdout)
|
||||
>>> print(result.stdout)
|
||||
"""
|
||||
|
||||
assert type(arguments) == list
|
||||
|
||||
args = [self.bin_zenith] + arguments
|
||||
log.info('Running command "{}"'.format(' '.join(args)))
|
||||
print('Running command "{}"'.format(' '.join(args)))
|
||||
|
||||
# Intercept CalledProcessError and print more info
|
||||
try:
|
||||
@@ -213,7 +208,7 @@ class ZenithCli:
|
||||
stdout: {exc.stdout}
|
||||
stderr: {exc.stderr}
|
||||
"""
|
||||
log.info(msg)
|
||||
print(msg)
|
||||
|
||||
raise Exception(msg) from exc
|
||||
|
||||
@@ -235,16 +230,16 @@ class ZenithPageserverHttpClient(requests.Session):
|
||||
self.headers['Authorization'] = f'Bearer {auth_token}'
|
||||
|
||||
def check_status(self):
|
||||
self.get(f"http://127.0.0.1:{self.port}/v1/status").raise_for_status()
|
||||
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
|
||||
|
||||
def branch_list(self, tenant_id: uuid.UUID) -> List[Dict]:
|
||||
res = self.get(f"http://127.0.0.1:{self.port}/v1/branch/{tenant_id.hex}")
|
||||
res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}")
|
||||
res.raise_for_status()
|
||||
return res.json()
|
||||
|
||||
def branch_create(self, tenant_id: uuid.UUID, name: str, start_point: str) -> Dict:
|
||||
res = self.post(
|
||||
f"http://127.0.0.1:{self.port}/v1/branch",
|
||||
f"http://localhost:{self.port}/v1/branch",
|
||||
json={
|
||||
'tenant_id': tenant_id.hex,
|
||||
'name': name,
|
||||
@@ -256,19 +251,19 @@ class ZenithPageserverHttpClient(requests.Session):
|
||||
|
||||
def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict:
|
||||
res = self.get(
|
||||
f"http://127.0.0.1:{self.port}/v1/branch/{tenant_id.hex}/{name}",
|
||||
f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}",
|
||||
)
|
||||
res.raise_for_status()
|
||||
return res.json()
|
||||
|
||||
def tenant_list(self) -> List[str]:
|
||||
res = self.get(f"http://127.0.0.1:{self.port}/v1/tenant")
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant")
|
||||
res.raise_for_status()
|
||||
return res.json()
|
||||
|
||||
def tenant_create(self, tenant_id: uuid.UUID):
|
||||
res = self.post(
|
||||
f"http://127.0.0.1:{self.port}/v1/tenant",
|
||||
f"http://localhost:{self.port}/v1/tenant",
|
||||
json={
|
||||
'tenant_id': tenant_id.hex,
|
||||
},
|
||||
@@ -277,7 +272,7 @@ class ZenithPageserverHttpClient(requests.Session):
|
||||
return res.json()
|
||||
|
||||
def get_metrics(self) -> str:
|
||||
res = self.get(f"http://127.0.0.1:{self.port}/metrics")
|
||||
res = self.get(f"http://localhost:{self.port}/metrics")
|
||||
res.raise_for_status()
|
||||
return res.text
|
||||
|
||||
@@ -348,7 +343,7 @@ class PageserverPort:
|
||||
class ZenithPageserver(PgProtocol):
|
||||
""" An object representing a running pageserver. """
|
||||
def __init__(self, zenith_cli: ZenithCli, repo_dir: str, port: PageserverPort):
|
||||
super().__init__(host='127.0.0.1', port=port.pg)
|
||||
super().__init__(host='localhost', port=port.pg)
|
||||
self.zenith_cli = zenith_cli
|
||||
self.running = False
|
||||
self.initial_tenant = None
|
||||
@@ -377,7 +372,6 @@ class ZenithPageserver(PgProtocol):
|
||||
Start the page server.
|
||||
Returns self.
|
||||
"""
|
||||
assert self.running == False
|
||||
|
||||
self.zenith_cli.run(['start'])
|
||||
self.running = True
|
||||
@@ -385,18 +379,14 @@ class ZenithPageserver(PgProtocol):
|
||||
self.initial_tenant = self.zenith_cli.run(['tenant', 'list']).stdout.strip()
|
||||
return self
|
||||
|
||||
def stop(self, immediate=False) -> 'ZenithPageserver':
|
||||
def stop(self) -> 'ZenithPageserver':
|
||||
"""
|
||||
Stop the page server.
|
||||
Returns self.
|
||||
"""
|
||||
cmd = ['stop']
|
||||
if immediate:
|
||||
cmd.append('immediate')
|
||||
|
||||
log.info(f"Stopping pageserver with {cmd}")
|
||||
if self.running:
|
||||
self.zenith_cli.run(cmd)
|
||||
self.zenith_cli.run(['stop'])
|
||||
self.running = False
|
||||
|
||||
return self
|
||||
@@ -405,7 +395,7 @@ class ZenithPageserver(PgProtocol):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
self.stop(True)
|
||||
self.stop()
|
||||
|
||||
@cached_property
|
||||
def auth_keys(self) -> AuthKeys:
|
||||
@@ -426,7 +416,7 @@ class ZenithPageserver(PgProtocol):
|
||||
def pageserver_port(port_distributor: PortDistributor) -> PageserverPort:
|
||||
pg = port_distributor.get_port()
|
||||
http = port_distributor.get_port()
|
||||
log.info(f"pageserver_port: pg={pg} http={http}")
|
||||
print(f"pageserver_port: pg={pg} http={http}")
|
||||
return PageserverPort(pg=pg, http=http)
|
||||
|
||||
|
||||
@@ -450,8 +440,8 @@ def pageserver(zenith_cli: ZenithCli, repo_dir: str, pageserver_port: Pageserver
|
||||
yield ps
|
||||
|
||||
# After the yield comes any cleanup code we need.
|
||||
log.info('Starting pageserver cleanup')
|
||||
ps.stop(True)
|
||||
print('Starting pageserver cleanup')
|
||||
ps.stop()
|
||||
|
||||
class PgBin:
|
||||
""" A helper class for executing postgres binaries """
|
||||
@@ -488,26 +478,24 @@ class PgBin:
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
log.info('Running command "{}"'.format(' '.join(command)))
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess.run(command, env=env, cwd=cwd, check=True)
|
||||
|
||||
def run_capture(self,
|
||||
command: List[str],
|
||||
env: Optional[Env] = None,
|
||||
cwd: Optional[str] = None,
|
||||
**kwargs: Any) -> None:
|
||||
cwd: Optional[str] = None) -> None:
|
||||
"""
|
||||
Run one of the postgres binaries, with stderr and stdout redirected to a file.
|
||||
|
||||
This is just like `run`, but for chatty programs. Returns basepath for files
|
||||
with captured output.
|
||||
This is just like `run`, but for chatty programs.
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
log.info('Running command "{}"'.format(' '.join(command)))
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs)
|
||||
subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True)
|
||||
|
||||
|
||||
@zenfixture
|
||||
@@ -525,7 +513,7 @@ def pageserver_auth_enabled(zenith_cli: ZenithCli, repo_dir: str, pageserver_por
|
||||
class Postgres(PgProtocol):
|
||||
""" An object representing a running postgres daemon. """
|
||||
def __init__(self, zenith_cli: ZenithCli, repo_dir: str, pg_bin: PgBin, tenant_id: str, port: int):
|
||||
super().__init__(host='127.0.0.1', port=port)
|
||||
super().__init__(host='localhost', port=port)
|
||||
|
||||
self.zenith_cli = zenith_cli
|
||||
self.running = False
|
||||
@@ -573,12 +561,12 @@ class Postgres(PgProtocol):
|
||||
|
||||
assert self.branch is not None
|
||||
|
||||
log.info(f"Starting postgres on branch {self.branch}")
|
||||
print(f"Starting postgres on branch {self.branch}")
|
||||
|
||||
run_result = self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}', f'--port={self.port}'])
|
||||
self.running = True
|
||||
|
||||
log.info(f"stdout: {run_result.stdout}")
|
||||
print(f"stdout: {run_result.stdout}")
|
||||
|
||||
return self
|
||||
|
||||
@@ -793,7 +781,7 @@ def postgres(zenith_cli: ZenithCli, initial_tenant: str, repo_dir: str, pg_bin:
|
||||
yield pgfactory
|
||||
|
||||
# After the yield comes any cleanup code we need.
|
||||
log.info('Starting postgres cleanup')
|
||||
print('Starting postgres cleanup')
|
||||
pgfactory.stop_all()
|
||||
|
||||
def read_pid(path: Path):
|
||||
@@ -801,17 +789,12 @@ def read_pid(path: Path):
|
||||
return int(path.read_text())
|
||||
|
||||
|
||||
@dataclass
|
||||
class WalAcceptorPort:
|
||||
pg: int
|
||||
http: int
|
||||
|
||||
@dataclass
|
||||
class WalAcceptor:
|
||||
""" An object representing a running wal acceptor daemon. """
|
||||
wa_bin_path: Path
|
||||
data_dir: Path
|
||||
port: WalAcceptorPort
|
||||
port: int
|
||||
num: int # identifier for logging
|
||||
pageserver_port: int
|
||||
auth_token: Optional[str] = None
|
||||
@@ -823,36 +806,28 @@ class WalAcceptor:
|
||||
|
||||
cmd = [str(self.wa_bin_path)]
|
||||
cmd.extend(["-D", str(self.data_dir)])
|
||||
cmd.extend(["--listen-pg", f"127.0.0.1:{self.port.pg}"])
|
||||
cmd.extend(["--listen-http", f"127.0.0.1:{self.port.http}"])
|
||||
cmd.extend(["-l", f"localhost:{self.port}"])
|
||||
cmd.append("--daemonize")
|
||||
cmd.append("--no-sync")
|
||||
# Tell page server it can receive WAL from this WAL safekeeper
|
||||
cmd.extend(["--pageserver", f"127.0.0.1:{self.pageserver_port}"])
|
||||
cmd.extend(["--pageserver", f"localhost:{self.pageserver_port}"])
|
||||
cmd.extend(["--recall", "1 second"])
|
||||
log.info('Running command "{}"'.format(' '.join(cmd)))
|
||||
print('Running command "{}"'.format(' '.join(cmd)))
|
||||
env = {'PAGESERVER_AUTH_TOKEN': self.auth_token} if self.auth_token else None
|
||||
subprocess.run(cmd, check=True, env=env)
|
||||
|
||||
# wait for wal acceptor start by checking its status
|
||||
started_at = time.time()
|
||||
while True:
|
||||
try:
|
||||
http_cli = self.http_client()
|
||||
http_cli.check_status()
|
||||
except Exception as e:
|
||||
elapsed = time.time() - started_at
|
||||
if elapsed > 3:
|
||||
raise RuntimeError(
|
||||
f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}")
|
||||
time.sleep(0.5)
|
||||
else:
|
||||
break # success
|
||||
return self
|
||||
# wait for wal acceptor start by checking that its pid is readable
|
||||
for _ in range(3):
|
||||
pid = self.get_pid()
|
||||
if pid is not None:
|
||||
return self
|
||||
time.sleep(0.5)
|
||||
|
||||
raise RuntimeError("cannot get wal acceptor pid")
|
||||
|
||||
@property
|
||||
def pidfile(self) -> Path:
|
||||
return self.data_dir / "safekeeper.pid"
|
||||
return self.data_dir / "wal_acceptor.pid"
|
||||
|
||||
def get_pid(self) -> Optional[int]:
|
||||
if not self.pidfile.exists():
|
||||
@@ -866,10 +841,10 @@ class WalAcceptor:
|
||||
return pid
|
||||
|
||||
def stop(self) -> 'WalAcceptor':
|
||||
log.info('Stopping wal acceptor {}'.format(self.num))
|
||||
print('Stopping wal acceptor {}'.format(self.num))
|
||||
pid = self.get_pid()
|
||||
if pid is None:
|
||||
log.info("Wal acceptor {} is not running".format(self.num))
|
||||
print("Wal acceptor {} is not running".format(self.num))
|
||||
return self
|
||||
|
||||
try:
|
||||
@@ -879,36 +854,11 @@ class WalAcceptor:
|
||||
pass # pidfile might be obsolete
|
||||
return self
|
||||
|
||||
def append_logical_message(self, tenant_id: str, timeline_id: str, request: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Send JSON_CTRL query to append LogicalMessage to WAL and modify
|
||||
safekeeper state. It will construct LogicalMessage from provided
|
||||
prefix and message, and then will write it to WAL.
|
||||
"""
|
||||
|
||||
# "replication=0" hacks psycopg not to send additional queries
|
||||
# on startup, see https://github.com/psycopg/psycopg2/pull/482
|
||||
connstr = f"host=127.0.0.1 port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'"
|
||||
|
||||
with closing(psycopg2.connect(connstr)) as conn:
|
||||
# server doesn't support transactions
|
||||
conn.autocommit = True
|
||||
with conn.cursor() as cur:
|
||||
request_json = json.dumps(request)
|
||||
log.info(f"JSON_CTRL request on port {self.port.pg}: {request_json}")
|
||||
cur.execute("JSON_CTRL " + request_json)
|
||||
all = cur.fetchall()
|
||||
log.info(f"JSON_CTRL response: {all[0][0]}")
|
||||
return json.loads(all[0][0])

def http_client(self):
return WalAcceptorHttpClient(port=self.port.http)


class WalAcceptorFactory:
""" An object representing multiple running wal acceptors. """
def __init__(self, zenith_binpath: Path, data_dir: Path, pageserver_port: int, port_distributor: PortDistributor):
self.wa_bin_path = zenith_binpath / 'safekeeper'
self.wa_bin_path = zenith_binpath / 'wal_acceptor'
self.data_dir = data_dir
self.instances: List[WalAcceptor] = []
self.port_distributor = port_distributor
@@ -922,10 +872,7 @@ class WalAcceptorFactory:
wa = WalAcceptor(
wa_bin_path=self.wa_bin_path,
data_dir=self.data_dir / "wal_acceptor_{}".format(wa_num),
port=WalAcceptorPort(
pg=self.port_distributor.get_port(),
http=self.port_distributor.get_port(),
),
port=self.port_distributor.get_port(),
num=wa_num,
pageserver_port=self.pageserver_port,
auth_token=auth_token,
@@ -949,7 +896,7 @@ class WalAcceptorFactory:

def get_connstrs(self) -> str:
""" Get list of wal acceptor endpoints suitable for wal_acceptors GUC """
return ','.join(["127.0.0.1:{}".format(wa.port.pg) for wa in self.instances])
return ','.join(["localhost:{}".format(wa.port) for wa in self.instances])
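
As a quick illustration of how these connection strings are consumed (mirroring the create_start call in test_bulk_tenant_create later in this diff; the branch name is made up):

# Hedged sketch: the comma-separated list feeds the wal_acceptors GUC.
wa_factory.start_n_new(3)
pg = postgres.create_start('some_branch', wal_acceptors=wa_factory.get_connstrs())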


@zenfixture
@@ -963,34 +910,16 @@ def wa_factory(zenith_binpath: str, repo_dir: str, pageserver_port: PageserverPo
)
yield wafactory
# After the yield comes any cleanup code we need.
log.info('Starting wal acceptors cleanup')
print('Starting wal acceptors cleanup')
wafactory.stop_all()

@dataclass
class PageserverTimelineStatus:
acceptor_epoch: int

class WalAcceptorHttpClient(requests.Session):
def __init__(self, port: int) -> None:
super().__init__()
self.port = port

def check_status(self):
self.get(f"http://127.0.0.1:{self.port}/v1/status").raise_for_status()

def timeline_status(self, tenant_id: str, timeline_id: str) -> PageserverTimelineStatus:
res = self.get(f"http://127.0.0.1:{self.port}/v1/timeline/{tenant_id}/{timeline_id}")
res.raise_for_status()
resj = res.json()
return PageserverTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'])
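
A brief usage sketch of this client (identifiers are placeholders; the endpoints match the safekeeper http routes shown later in this diff):

# Hypothetical illustration; 'wa' stands in for a running WalAcceptor fixture.
cli = wa.http_client()
cli.check_status()  # raises on a non-2xx response
status = cli.timeline_status(tenant_id, timeline_id)
log.info(f"acceptor epoch: {status.acceptor_epoch}")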


@zenfixture
def base_dir() -> str:
""" find the base directory (currently this is the git root) """

base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..'))
log.info(f'base_dir is {base_dir}')
print('\nbase_dir is', base_dir)
return base_dir


@@ -1019,7 +948,7 @@ def test_output_dir(request: Any, top_output_dir: str) -> str:
test_name = 'shared'

test_output_dir = os.path.join(top_output_dir, test_name)
log.info(f'test_output_dir is {test_output_dir}')
print('test_output_dir is', test_output_dir)
shutil.rmtree(test_output_dir, ignore_errors=True)
mkdir_if_needed(test_output_dir)
return test_output_dir
@@ -1061,7 +990,7 @@ def pg_distrib_dir(base_dir: str) -> str:
pg_dir = env_postgres_bin
else:
pg_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
log.info(f'postgres dir is {pg_dir}')
print('postgres dir is', pg_dir)
if not os.path.exists(os.path.join(pg_dir, 'bin/postgres')):
raise Exception('postgres not found at "{}"'.format(pg_dir))
return pg_dir
@@ -1100,7 +1029,7 @@ def list_files_to_compare(pgdata_dir: str):
pgdata_files.append(rel_file)

pgdata_files.sort()
log.info(pgdata_files)
print(pgdata_files)
return pgdata_files

# pg is the existing and running compute node, that we want to compare with a basebackup
@@ -1124,7 +1053,7 @@ def check_restored_datadir_content(zenith_cli: ZenithCli, test_output_dir: str,
cmd = rf"""
{psql_path} \
--no-psqlrc \
postgres://127.0.0.1:{pageserver_pg_port} \
postgres://localhost:{pageserver_pg_port} \
-c 'basebackup {pg.tenant_id} {timeline}' \
| tar -x -C {restored_dir_path}
"""
@@ -1146,7 +1075,9 @@ def check_restored_datadir_content(zenith_cli: ZenithCli, test_output_dir: str,
restored_dir_path,
pgdata_files,
shallow=False)
log.info(f'filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}')
print('filecmp result mismatch and error lists:')
print(mismatch)
print(error)

for f in mismatch:


@@ -1,8 +0,0 @@
#!/bin/bash

while true; do
echo -n "==== CURRENT TIME:" >> /tmp/test_output/netstat.stdout
date +"%T.%N" >> /tmp/test_output/netstat.stdout
sudo netstat -vpnoa | grep tcp | sort >> /tmp/test_output/netstat.stdout
sleep 0.5
done
@@ -1,10 +1,22 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)

totalbytes = 0
for root, dirs, files in os.walk(path):
for name in files:
totalbytes += os.path.getsize(os.path.join(root, name))

if 'wal' in dirs:
dirs.remove('wal') # don't visit 'wal' subdirectory

return totalbytes
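
A short sketch of how this helper is used (mirroring the benchmark code further down in this file; fixtures supply repo_dir, tenant, and timeline):

# Report repository size in MB for the benchmark run.
timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')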

#
# Run bulk INSERT test.
#
@@ -13,14 +25,13 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
# 1. Time to INSERT 5 million rows
# 2. Disk writes
# 3. Disk space used
# 4. Peak memory usage
#
def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
# Create a branch for us
zenith_cli.run(["branch", "test_bulk_insert", "empty"])

pg = postgres.create_start('test_bulk_insert')
log.info("postgres is running on 'test_bulk_insert' branch")
print("postgres is running on 'test_bulk_insert' branch")

# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
@@ -44,9 +55,6 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")

# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')

# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')

@@ -1,58 +0,0 @@
import timeit
import pytest

from fixtures.zenith_fixtures import (
TenantFactory,
ZenithCli,
PostgresFactory,
)

pytest_plugins = ("fixtures.benchmark_fixture")

# Run bulk tenant creation test.
#
# Collects metrics:
#
# 1. Time to create {1,10,50} tenants
# 2. Average creation time per tenant


@pytest.mark.parametrize('tenants_count', [1, 5, 10])
@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa'])
def test_bulk_tenant_create(
zenith_cli: ZenithCli,
tenant_factory: TenantFactory,
postgres: PostgresFactory,
wa_factory,
use_wal_acceptors: str,
tenants_count: int,
zenbenchmark,
):
"""Measure tenant creation time (with and without wal acceptors)"""

time_slices = []

for i in range(tenants_count):
start = timeit.default_timer()

tenant = tenant_factory.create()
zenith_cli.run([
"branch", f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", "main",
f"--tenantid={tenant}"
])

if use_wal_acceptors == 'with_wa':
wa_factory.start_n_new(3)

pg_tenant = postgres.create_start(
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
tenant,
wal_acceptors=wa_factory.get_connstrs() if use_wal_acceptors == 'with_wa' else None,
)

end = timeit.default_timer()
time_slices.append(end - start)

pg_tenant.stop()

zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's')
@@ -1,49 +0,0 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

#
# Test buffering GiST build. It WAL-logs the whole relation, in 32-page chunks.
# As of this writing, we duplicate those giant WAL records for each page,
# which makes the delta layer about 32x larger than it needs to be.
#
def test_gist_buffering_build(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
# Create a branch for us
zenith_cli.run(["branch", "test_gist_buffering_build", "empty"])

pg = postgres.create_start('test_gist_buffering_build')
log.info("postgres is running on 'test_gist_buffering_build' branch")

# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect();
pscur = psconn.cursor()

# Get the timeline ID of our branch. We need it for the 'do_gc' command
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]

# Create test table.
cur.execute("create table gist_point_tbl(id int4, p point)");
cur.execute("insert into gist_point_tbl select g, point(g, g) from generate_series(1, 1000000) g;");

# Build the index.
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('build'):
cur.execute("create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)");

# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 1000000")

# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')

# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
@@ -1,10 +1,22 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)

totalbytes = 0
for root, dirs, files in os.walk(path):
for name in files:
totalbytes += os.path.getsize(os.path.join(root, name))

if 'wal' in dirs:
dirs.remove('wal') # don't visit 'wal' subdirectory

return totalbytes

#
# Run a very short pgbench test.
#
@@ -19,7 +31,7 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
zenith_cli.run(["branch", "test_pgbench_perf", "empty"])

pg = postgres.create_start('test_pgbench_perf')
log.info("postgres is running on 'test_pgbench_perf' branch")
print("postgres is running on 'test_pgbench_perf' branch")

# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
@@ -52,5 +64,5 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")

# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')

@@ -1,75 +0,0 @@
||||
# Demonstrate Write Amplification with the naive oldest-first layer checkpointing
# algorithm.
#
# In each iteration of the test, we create a new table that's slightly under 10
# MB in size (10 MB is the current "segment size" used by the page server). Then
# we make a tiny update to all the tables already created. This creates a WAL
# pattern where you have a lot of updates on one segment (the newly created
# one), alternating with small updates on all relations. This is the worst
# case scenario for the naive checkpointing policy where we write out the layers
# in LSN order, writing the oldest layer first. It causes a new 10 MB image
# layer to be created for each of those small updates. This is the Write
# Amplification problem at its finest.
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
# Create a branch for us
zenith_cli.run(["branch", "test_write_amplification", "empty"])

pg = postgres.create_start('test_write_amplification')
log.info("postgres is running on 'test_write_amplification' branch")

# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect();
pscur = psconn.cursor()

with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]

with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('run'):

# NOTE: Because each iteration updates every table already created,
# the runtime and write amplification is O(n^2), where n is the
# number of iterations.
for i in range(25):
cur.execute(f'''
CREATE TABLE tbl{i} AS
SELECT g as i, 'long string to consume some space' || g as t
FROM generate_series(1, 100000) g
''')
cur.execute(f"create index on tbl{i} (i);")
for j in range(1, i):
cur.execute(f"delete from tbl{j} where i = {i}")

# Force checkpointing. As of this writing, we don't have
# a back-pressure mechanism, and the page server cannot
# keep up digesting and checkpointing the WAL at the
# rate that it is generated. If we don't force a
# checkpoint, the WAL will just accumulate in memory
# until you hit OOM error. So in effect, we use much
# more memory to hold the incoming WAL, and write them
# out in larger batches than we'd really want. Using
# more memory hides the write amplification problem this
# test tries to demonstrate.
#
# The write amplification problem is real, and using
# more memory isn't the right solution. We could
# demonstrate the effect also by generating the WAL
# slower, adding some delays in this loop. But forcing
# the checkpointing and GC makes the test go faster,
# with the same total I/O effect.
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")

# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
@@ -1,4 +1,2 @@
[pytest]
minversion = 6.0
log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
log_date_format = %Y-%m-%d %H:%M:%S
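
For reference, a minimal sketch of what this format yields when fed through Python's stdlib logging (the message and file location below are made up):

import logging
logging.basicConfig(
format='%(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.INFO)
logging.getLogger(__name__).info('postgres is running')
# e.g. 2021-08-05 12:00:00.42  INFO [test_broken.py:25] postgres is running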

@@ -1,8 +1,6 @@
import pytest
import os

from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures")
"""
Use this test to see what happens when tests fail.
@@ -24,7 +22,7 @@ def test_broken(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_broken", "empty"])

postgres.create_start("test_broken")
log.info('postgres is running')
print('postgres is running')

log.info('THIS NEXT COMMAND WILL FAIL:')
print('THIS NEXT COMMAND WILL FAIL:')
pg_bin.run('pgbench -i_am_a_broken_test'.split())

Submodule vendor/postgres updated: 8e82cb4a36...607255fb7a
@@ -11,11 +11,8 @@ regex = "1.4.5"
|
||||
bincode = "1.3"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
hyper = "0.14"
|
||||
routerify = "2"
|
||||
fs2 = "0.4.3"
|
||||
lazy_static = "1.4.0"
|
||||
serde_json = "1"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
@@ -30,11 +27,9 @@ humantime = "2.1.0"
|
||||
walkdir = "2"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
hex = "0.4.3"
|
||||
const_format = "0.2.21"
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
@@ -76,43 +76,6 @@ safekeepers.
|
||||
See README_PROTO.md for a more detailed description of the consensus
protocol. spec/ contains TLA+ specification of it.

# Q&A

Q: Why have a separate service instead of connecting Page Server directly to a
primary PostgreSQL node?
A: Page Server is a single server which can be lost. As our primary
fault-tolerant storage is S3, we do not want to wait for it before
committing a transaction. The WAL service acts as a temporary fault-tolerant
storage for recent data before it gets to the Page Server and then finally
to S3. Whenever WALs and pages are committed to S3, WAL's storage can be
trimmed.

Q: What if the compute node evicts a page, needs it back, but the page is yet
to reach the Page Server?
A: If the compute node has evicted a page, changes to it have been WAL-logged
(that's why it is called Write Ahead logging; there are some exceptions like
index builds, but these are exceptions). These WAL records will eventually
reach the Page Server. The Page Server notes that the compute node requests
pages with a very recent LSN and will not respond to the compute node until a
corresponding WAL is received from WAL safekeepers.

Q: How long may the Page Server wait?
A: Not too long, hopefully. If a page is evicted, it probably was not used for
a while, so the WAL service has had enough time to push changes to the Page
Server. There may be issues if there is no backpressure and the compute node
with the WAL service runs ahead of the Page Server, though.
There is no backpressure right now, so you may even see some spurious
timeouts in tests.

Q: How do WAL safekeepers communicate with each other?
A: They may only send each other messages via the compute node, they never
communicate directly with each other.

Q: Why have a consensus algorithm if there is only a single compute node?
A: Actually there may be moments with multiple PostgreSQL nodes running at the
same time. E.g. we are bringing one up and one down. We would like to avoid
simultaneous writes from different nodes, so there should be a consensus on
who is the primary node.

# Terminology


@@ -1,48 +1,35 @@
//
// Main entry point for the safekeeper executable
// Main entry point for the wal_acceptor executable
//
use anyhow::Result;
use clap::{App, Arg};
use const_format::formatcp;
use daemonize::Daemonize;
use log::*;
use std::env;
use std::net::TcpListener;
use std::path::{Path, PathBuf};
use std::thread;
use zenith_utils::http::endpoint;
use zenith_utils::logging;

use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
use walkeeper::http;
use walkeeper::s3_offload;
use walkeeper::wal_service;
use walkeeper::WalAcceptorConf;

fn main() -> Result<()> {
zenith_metrics::set_common_metrics_prefix("safekeeper");
let arg_matches = App::new("Zenith safekeeper")
let arg_matches = App::new("Zenith wal_acceptor")
.about("Store WAL stream to local file system and push it to WAL receivers")
.arg(
Arg::with_name("datadir")
.short("D")
.long("dir")
.takes_value(true)
.help("Path to the safekeeper data directory"),
.help("Path to the WAL acceptor data directory"),
)
.arg(
Arg::with_name("listen-pg")
Arg::with_name("listen")
.short("l")
.long("listen-pg")
.alias("listen") // for compatibility
.long("listen")
.takes_value(true)
.help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
)
.arg(
Arg::with_name("listen-http")
.long("listen-http")
.takes_value(true)
.help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
.help("listen for incoming connections on ip:port (default: 127.0.0.1:5454)"),
)
.arg(
Arg::with_name("pageserver")
@@ -83,8 +70,7 @@ fn main() -> Result<()> {
daemonize: false,
no_sync: false,
pageserver_addr: None,
listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(),
listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(),
listen_addr: "localhost:5454".to_string(),
ttl: None,
recall_period: None,
pageserver_auth_token: env::var("PAGESERVER_AUTH_TOKEN").ok(),
@@ -105,12 +91,8 @@ fn main() -> Result<()> {
conf.daemonize = true;
}

if let Some(addr) = arg_matches.value_of("listen-pg") {
conf.listen_pg_addr = addr.to_owned();
}

if let Some(addr) = arg_matches.value_of("listen-http") {
conf.listen_http_addr = addr.to_owned();
if let Some(addr) = arg_matches.value_of("listen") {
conf.listen_addr = addr.to_owned();
}

if let Some(addr) = arg_matches.value_of("pageserver") {
@@ -129,19 +111,8 @@ fn main() -> Result<()> {
}

fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
let log_filename = conf.data_dir.join("safekeeper.log");
let log_file = logging::init(log_filename, conf.daemonize)?;

let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
e
})?;

info!("Starting safekeeper on {}", conf.listen_pg_addr);
let pg_listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e
})?;
let log_filename = conf.data_dir.join("wal_acceptor.log");
let (_scope_guard, log_file) = logging::init(log_filename, conf.daemonize)?;

if conf.daemonize {
info!("daemonizing...");
@@ -152,7 +123,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
let stderr = log_file;

let daemonize = Daemonize::new()
.pid_file("safekeeper.pid")
.pid_file("wal_acceptor.pid")
.working_directory(Path::new("."))
.stdout(stdout)
.stderr(stderr);
@@ -165,17 +136,6 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {

let mut threads = Vec::new();

let conf_cloned = conf.clone();
let http_endpoint_thread = thread::Builder::new()
.name("http_endpoint_thread".into())
.spawn(|| {
// TODO authentication
let router = http::make_router(conf_cloned);
endpoint::serve_thread_main(router, http_listener).unwrap();
})
.unwrap();
threads.push(http_endpoint_thread);

if conf.ttl.is_some() {
let s3_conf = conf.clone();
let s3_offload_thread = thread::Builder::new()
@@ -192,7 +152,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
.name("WAL acceptor thread".into())
.spawn(|| {
// thread code
let thread_result = wal_service::thread_main(conf, pg_listener);
let thread_result = wal_service::thread_main(conf);
if let Err(e) = thread_result {
info!("wal_service thread terminated: {}", e);
}
@@ -1,2 +0,0 @@
pub mod routes;
pub use routes::make_router;
@@ -1,88 +0,0 @@
use hyper::{Body, Request, Response, StatusCode};
use routerify::ext::RequestExt;
use routerify::RouterBuilder;
use serde::Serialize;
use serde::Serializer;
use std::fmt::Display;
use std::sync::Arc;
use zenith_utils::lsn::Lsn;

use crate::safekeeper::AcceptorState;
use crate::timeline::CreateControlFile;
use crate::timeline::GlobalTimelines;
use crate::WalAcceptorConf;
use zenith_utils::http::endpoint;
use zenith_utils::http::error::ApiError;
use zenith_utils::http::json::json_response;
use zenith_utils::http::request::parse_request_param;
use zenith_utils::zid::{ZTenantId, ZTimelineId};

/// Healthcheck handler.
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
Ok(json_response(StatusCode::OK, "")?)
}

fn get_conf(request: &Request<Body>) -> &WalAcceptorConf {
request
.data::<Arc<WalAcceptorConf>>()
.expect("unknown state type")
.as_ref()
}

fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
F: Display,
{
s.serialize_str(&format!("{}", z))
}

/// Info about timeline on safekeeper ready for reporting.
#[derive(Debug, Serialize)]
struct TimelineStatus {
#[serde(serialize_with = "display_serialize")]
tenant_id: ZTenantId,
#[serde(serialize_with = "display_serialize")]
timeline_id: ZTimelineId,
acceptor_state: AcceptorState,
#[serde(serialize_with = "display_serialize")]
commit_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
truncate_lsn: Lsn,
}

/// Report info about timeline.
async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;

let tli = GlobalTimelines::get(
get_conf(&request),
tenant_id,
timeline_id,
CreateControlFile::False,
)
.map_err(ApiError::from_err)?;
let sk_state = tli.get_info();

let status = TimelineStatus {
tenant_id,
timeline_id,
acceptor_state: sk_state.acceptor_state,
commit_lsn: sk_state.commit_lsn,
truncate_lsn: sk_state.truncate_lsn,
};
Ok(json_response(StatusCode::OK, status)?)
}

/// Safekeeper http router.
pub fn make_router(conf: WalAcceptorConf) -> RouterBuilder<hyper::Body, ApiError> {
let router = endpoint::make_router();
router
.data(Arc::new(conf))
.get("/v1/status", status_handler)
.get(
"/v1/timeline/:tenant_id/:timeline_id",
timeline_status_handler,
)
}
@@ -1,249 +0,0 @@
//!
//! This module implements the JSON_CTRL protocol, which allows exchanging
//! JSON messages over psql for testing purposes.
//!
//! Currently supports AppendLogicalMessage, which is used for WAL
//! modifications in tests.
//!

use anyhow::{anyhow, Result};
use bytes::{BufMut, Bytes, BytesMut};
use crc32c::crc32c_append;
use log::*;
use serde::{Deserialize, Serialize};

use crate::safekeeper::{AcceptorProposerMessage, AppendResponse};
use crate::safekeeper::{
AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerGreeting,
};
use crate::safekeeper::{SafeKeeperState, Term};
use crate::send_wal::SendWalHandler;
use crate::timeline::TimelineTools;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils;
use postgres_ffi::{uint32, uint64, Oid, XLogRecord};
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{BeMessage, RowDescriptor, TEXT_OID};

#[derive(Serialize, Deserialize, Debug)]
struct AppendLogicalMessage {
// prefix and message to build LogicalMessage
lm_prefix: String,
lm_message: String,

// if true, commit_lsn will match flush_lsn after append
set_commit_lsn: bool,

// fields from AppendRequestHeader
term: Term,
epoch_start_lsn: Lsn,
begin_lsn: Lsn,
truncate_lsn: Lsn,
}

#[derive(Serialize, Deserialize)]
struct AppendResult {
// safekeeper state after append
state: SafeKeeperState,
// info about new record in the WAL
inserted_wal: InsertedWAL,
}

/// Handles command to craft logical message WAL record with given
/// content, and then append it with specified term and lsn. This
/// function is used to test safekeepers in different scenarios.
pub fn handle_json_ctrl(
swh: &mut SendWalHandler,
pgb: &mut PostgresBackend,
cmd: &Bytes,
) -> Result<()> {
let cmd = cmd
.strip_prefix(b"JSON_CTRL")
.ok_or_else(|| anyhow!("invalid prefix"))?;
// trim zeroes at the end
let cmd = cmd.strip_suffix(&[0u8]).unwrap_or(cmd);

let append_request: AppendLogicalMessage = serde_json::from_slice(cmd)?;
info!("JSON_CTRL request: {:?}", append_request);

// need to init safekeeper state before AppendRequest
prepare_safekeeper(swh)?;

let inserted_wal = append_logical_message(swh, append_request)?;
let response = AppendResult {
state: swh.timeline.get().get_info(),
inserted_wal,
};
let response_data = serde_json::to_vec(&response)?;

pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor {
name: b"json",
typoid: TEXT_OID,
typlen: -1,
..Default::default()
}]))?
.write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))?
.write_message(&BeMessage::CommandComplete(b"JSON_CTRL"))?;
Ok(())
}

/// Prepare safekeeper to process append requests without crashes,
/// by sending ProposerGreeting with default server.wal_seg_size.
fn prepare_safekeeper(swh: &mut SendWalHandler) -> Result<()> {
let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting {
protocol_version: 1, // current protocol
pg_version: 0, // unknown
proposer_id: [0u8; 16],
system_id: 0,
ztli: swh.timelineid.unwrap(),
tenant_id: swh.tenantid.unwrap(),
tli: 0,
wal_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, // 16MB, default for tests
});

let response = swh.timeline.get().process_msg(&greeting_request)?;
match response {
AcceptorProposerMessage::Greeting(_) => Ok(()),
_ => anyhow::bail!("not GreetingResponse"),
}
}

#[derive(Serialize, Deserialize)]
struct InsertedWAL {
begin_lsn: Lsn,
end_lsn: Lsn,
append_response: AppendResponse,
}

/// Extend local WAL with new LogicalMessage record. To do that,
/// create AppendRequest with new WAL and pass it to safekeeper.
fn append_logical_message(
swh: &mut SendWalHandler,
msg: AppendLogicalMessage,
) -> Result<InsertedWAL> {
let wal_data = encode_logical_message(msg.lm_prefix, msg.lm_message);
let sk_state = swh.timeline.get().get_info();

let begin_lsn = msg.begin_lsn;
let end_lsn = begin_lsn + wal_data.len() as u64;

let commit_lsn = if msg.set_commit_lsn {
end_lsn
} else {
sk_state.commit_lsn
};

let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
h: AppendRequestHeader {
term: msg.term,
epoch_start_lsn: begin_lsn,
begin_lsn,
end_lsn,
commit_lsn,
truncate_lsn: msg.truncate_lsn,
proposer_uuid: [0u8; 16],
},
wal_data: Bytes::from(wal_data),
});

let response = swh.timeline.get().process_msg(&append_request)?;

let append_response = match response {
AcceptorProposerMessage::AppendResponse(resp) => resp,
_ => anyhow::bail!("not AppendResponse"),
};

Ok(InsertedWAL {
begin_lsn,
end_lsn,
append_response,
})
}

#[repr(C)]
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
struct XlLogicalMessage {
db_id: Oid,
transactional: uint32, // bool, takes 4 bytes due to alignment in C structures
prefix_size: uint64,
message_size: uint64,
}

impl XlLogicalMessage {
pub fn encode(&self) -> Bytes {
use zenith_utils::bin_ser::LeSer;
self.ser().unwrap().into()
}
}

/// Create new WAL record for non-transactional logical message.
/// Used for creating artificial WAL for tests, as LogicalMessage
/// record is basically no-op.
fn encode_logical_message(prefix: String, message: String) -> Vec<u8> {
let mut prefix_bytes = BytesMut::with_capacity(prefix.len() + 1);
prefix_bytes.put(prefix.as_bytes());
prefix_bytes.put_u8(0);

let message_bytes = message.as_bytes();

let logical_message = XlLogicalMessage {
db_id: 0,
transactional: 0,
prefix_size: prefix_bytes.len() as u64,
message_size: message_bytes.len() as u64,
};

let mainrdata = logical_message.encode();
let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len();
// only short mainrdata is supported for now
assert!(mainrdata_len <= 255);
let mainrdata_len = mainrdata_len as u8;

let mut data: Vec<u8> = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len];
data.extend_from_slice(&mainrdata);
data.extend_from_slice(&prefix_bytes);
data.extend_from_slice(message_bytes);

let total_len = xlog_utils::XLOG_SIZE_OF_XLOG_RECORD + data.len();

let mut header = XLogRecord {
xl_tot_len: total_len as u32,
xl_xid: 0,
xl_prev: 0,
xl_info: 0,
xl_rmid: 21,
__bindgen_padding_0: [0u8; 2usize],
xl_crc: 0, // crc will be calculated later
};

let header_bytes = header.encode();
let crc = crc32c_append(0, &data);
let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]);
header.xl_crc = crc;

let mut wal: Vec<u8> = Vec::new();
wal.extend_from_slice(&header.encode());
wal.extend_from_slice(&data);

// WAL start position must be aligned at 8 bytes,
// this will add padding for the next WAL record.
const PADDING: usize = 8;
let padding_rem = wal.len() % PADDING;
if padding_rem != 0 {
wal.resize(wal.len() + PADDING - padding_rem, 0);
}

wal
}

#[test]
fn test_encode_logical_message() {
let expected = [
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
];
let actual = encode_logical_message("prefix".to_string(), "message".to_string());
assert_eq!(expected, actual[..]);
}
@@ -2,8 +2,6 @@
use std::path::PathBuf;
use std::time::Duration;

pub mod http;
pub mod json_ctrl;
pub mod receive_wal;
pub mod replication;
pub mod s3_offload;
@@ -12,23 +10,12 @@ pub mod send_wal;
pub mod timeline;
pub mod wal_service;

pub mod defaults {
use const_format::formatcp;

pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
}

#[derive(Debug, Clone)]
pub struct WalAcceptorConf {
pub data_dir: PathBuf,
pub daemonize: bool,
pub no_sync: bool,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_addr: String,
pub pageserver_addr: Option<String>,
// TODO (create issue) this is temporary, until protocol between PG<->SK<->PS rework
pub pageserver_auth_token: Option<String>,

@@ -42,7 +42,7 @@ fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId, tenantid: ZT
);

// use Config parsing because SockAddr parsing doesn't allow using host names instead of IP addresses
let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_pg_addr);
let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_addr);
let me_conf: Config = me_connstr.parse().unwrap();
let (host, port) = connection_host_port(&me_conf);
let callme = format!(

@@ -15,11 +15,8 @@ use std::cmp::min;
use std::io;
use std::io::Read;

use lazy_static::lazy_static;

use crate::replication::HotStandbyFeedback;
use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec};
use zenith_utils::bin_ser::LeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::pq_proto::SystemId;
@@ -31,7 +28,7 @@ const SK_PROTOCOL_VERSION: u32 = 1;
const UNKNOWN_SERVER_VERSION: u32 = 0;

/// Consensus logical timestamp.
pub type Term = u64;
type Term = u64;

/// Unique id of proposer. Not needed for correctness, used for monitoring.
type PgUuid = [u8; 16];
@@ -114,7 +111,7 @@ impl Default for SafeKeeperState {
// protocol messages

/// Initial Proposer -> Acceptor message
#[derive(Debug, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ProposerGreeting {
/// proposer-acceptor protocol version
pub protocol_version: u32,
@@ -131,19 +128,19 @@ pub struct ProposerGreeting {

/// Acceptor -> Proposer initial response: the highest term known to me
/// (acceptor voted for).
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct AcceptorGreeting {
term: u64,
}

/// Vote request sent from proposer to safekeepers
#[derive(Debug, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct VoteRequest {
term: Term,
}

/// Vote itself, sent from safekeeper to proposer
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct VoteResponse {
term: Term, // not really needed, just a sanity check
vote_given: u64, // fixme u64 due to padding
@@ -155,26 +152,26 @@ pub struct VoteResponse {

/// Request with WAL message sent from proposer to safekeeper. Along the way it
/// announces 1) successful election (with epoch_start_lsn); 2) commit_lsn.
#[derive(Debug)]
#[derive(Debug, Serialize, Deserialize)]
pub struct AppendRequest {
pub h: AppendRequestHeader,
pub wal_data: Bytes,
h: AppendRequestHeader,
wal_data: Bytes,
}
#[derive(Debug, Clone, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AppendRequestHeader {
pub term: Term,
term: Term,
// LSN since the proposer appends WAL; determines epoch switch point.
pub epoch_start_lsn: Lsn,
epoch_start_lsn: Lsn,
/// start position of message in WAL
pub begin_lsn: Lsn,
begin_lsn: Lsn,
/// end position of message in WAL
pub end_lsn: Lsn,
end_lsn: Lsn,
/// LSN committed by quorum of safekeepers
pub commit_lsn: Lsn,
commit_lsn: Lsn,
/// minimal LSN which may be needed by proposer to perform recovery of some safekeeper
pub truncate_lsn: Lsn,
truncate_lsn: Lsn,
// only for logging/debugging
pub proposer_uuid: PgUuid,
proposer_uuid: PgUuid,
}

/// Report safekeeper state to proposer
@@ -278,47 +275,8 @@ impl AcceptorProposerMessage {
pub trait Storage {
/// Persist safekeeper state on disk, optionally syncing it.
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()>;
/// Write piece of wal in buf to disk and sync it.
fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>;
}

lazy_static! {
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
// i64 is faster than f64, so update to u64 when available.
static ref FLUSH_LSN_GAUGE: GaugeVec = register_gauge_vec!(
"safekeeper_flush_lsn",
"Current flush_lsn, grouped by timeline",
&["ztli"]
)
.expect("Failed to register safekeeper_flush_lsn gauge vec");
static ref COMMIT_LSN_GAUGE: GaugeVec = register_gauge_vec!(
"safekeeper_commit_lsn",
"Current commit_lsn (not necessarily persisted to disk), grouped by timeline",
&["ztli"]
)
.expect("Failed to register safekeeper_commit_lsn gauge vec");
}

struct SafeKeeperMetrics {
flush_lsn: Gauge,
commit_lsn: Gauge,
}

impl SafeKeeperMetrics {
fn new(ztli: ZTimelineId) -> SafeKeeperMetrics {
let ztli_str = format!("{}", ztli);
SafeKeeperMetrics {
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&ztli_str]),
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&ztli_str]),
}
}

fn new_noname() -> SafeKeeperMetrics {
SafeKeeperMetrics {
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&["n/a"]),
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&["n/a"]),
}
}
/// Write piece of wal in buf to disk.
fn write_wal(&mut self, s: &SafeKeeperState, startpos: Lsn, buf: &[u8]) -> Result<()>;
}

/// SafeKeeper which consumes events (messages from compute) and provides
@@ -328,8 +286,6 @@ pub struct SafeKeeper<ST: Storage> {
/// Established by reading wal.
pub flush_lsn: Lsn,
pub tli: u32,
// Cached metrics so we don't have to recompute labels on each update.
metrics: SafeKeeperMetrics,
/// not-yet-flushed pairs of same named fields in s.*
pub commit_lsn: Lsn,
pub truncate_lsn: Lsn,
@@ -348,7 +304,6 @@ where
SafeKeeper {
flush_lsn,
tli,
metrics: SafeKeeperMetrics::new_noname(),
commit_lsn: state.commit_lsn,
truncate_lsn: state.truncate_lsn,
storage,
@@ -400,8 +355,6 @@ where
self.s.server.wal_seg_size = msg.wal_seg_size;
self.storage.persist(&self.s, true)?;

self.metrics = SafeKeeperMetrics::new(self.s.server.ztli);

info!(
"processed greeting from proposer {:?}, sending term {:?}",
msg.proposer_id, self.s.acceptor_state.term
@@ -470,7 +423,7 @@ where
let mut last_rec_lsn = Lsn(0);
if !msg.wal_data.is_empty() {
self.storage
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
.write_wal(&self.s, msg.h.begin_lsn, &msg.wal_data)?;

// figure out last record's end lsn for reporting (if we got the
// whole record)
@@ -525,7 +478,6 @@ where
}
if last_rec_lsn > self.flush_lsn {
self.flush_lsn = last_rec_lsn;
self.metrics.flush_lsn.set(u64::from(self.flush_lsn) as f64);
}

// Advance commit_lsn taking into account what we have locally. xxx this
@@ -543,9 +495,6 @@ where
sync_control_file |=
commit_lsn >= msg.h.epoch_start_lsn && self.s.commit_lsn < msg.h.epoch_start_lsn;
self.commit_lsn = commit_lsn;
self.metrics
.commit_lsn
.set(u64::from(self.commit_lsn) as f64);
}

self.truncate_lsn = msg.h.truncate_lsn;
@@ -597,7 +546,7 @@ mod tests {
Ok(())
}

fn write_wal(&mut self, _server: &ServerInfo, _startpos: Lsn, _buf: &[u8]) -> Result<()> {
fn write_wal(&mut self, _s: &SafeKeeperState, _startpos: Lsn, _buf: &[u8]) -> Result<()> {
Ok(())
}
}

@@ -2,7 +2,6 @@
//! pageserver/any other consumer.
//!

use crate::json_ctrl::handle_json_ctrl;
use crate::receive_wal::ReceiveWalConn;
use crate::replication::ReplicationConn;
use crate::timeline::{Timeline, TimelineTools};
@@ -13,13 +12,14 @@ use std::str::FromStr;
use std::sync::Arc;
use zenith_utils::postgres_backend;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{BeMessage, FeStartupMessage, RowDescriptor, INT4_OID, TEXT_OID};
use zenith_utils::pq_proto::{BeMessage, FeStartupMessage, RowDescriptor};
use zenith_utils::zid::{ZTenantId, ZTimelineId};

use crate::timeline::CreateControlFile;

/// Handler for streaming WAL from acceptor
pub struct SendWalHandler {
/// wal acceptor configuration
pub conf: WalAcceptorConf,
/// assigned application name
pub appname: Option<String>,
@@ -49,11 +49,9 @@ impl postgres_backend::Handler for SendWalHandler {
}

fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: Bytes) -> Result<()> {
// START_WAL_PUSH is the only command that initializes the timeline in production.
// There is also JSON_CTRL command, which should initialize the timeline for testing.
// START_WAL_PUSH is the only command that initializes the timeline
if self.timeline.is_none() {
if query_string.starts_with(b"START_WAL_PUSH") || query_string.starts_with(b"JSON_CTRL")
{
if query_string.starts_with(b"START_WAL_PUSH") {
self.timeline.set(
&self.conf,
self.tenantid.unwrap(),
@@ -71,16 +69,16 @@ impl postgres_backend::Handler for SendWalHandler {
}
if query_string.starts_with(b"IDENTIFY_SYSTEM") {
self.handle_identify_system(pgb)?;
Ok(())
} else if query_string.starts_with(b"START_REPLICATION") {
ReplicationConn::new(pgb).run(self, pgb, &query_string)?;
Ok(())
} else if query_string.starts_with(b"START_WAL_PUSH") {
ReceiveWalConn::new(pgb)?.run(self)?;
} else if query_string.starts_with(b"JSON_CTRL") {
handle_json_ctrl(self, pgb, &query_string)?;
Ok(())
} else {
bail!("Unexpected command {:?}", query_string);
}
Ok(())
}
}

@@ -110,25 +108,25 @@ impl SendWalHandler {
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor {
name: b"systemid",
typoid: TEXT_OID,
typoid: 25,
typlen: -1,
..Default::default()
},
RowDescriptor {
name: b"timeline",
typoid: INT4_OID,
typoid: 23,
typlen: 4,
..Default::default()
},
RowDescriptor {
name: b"xlogpos",
typoid: TEXT_OID,
typoid: 25,
typlen: -1,
..Default::default()
},
RowDescriptor {
name: b"dbname",
typoid: TEXT_OID,
typoid: 25,
typlen: -1,
..Default::default()
},

@@ -18,8 +18,8 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo,
|
||||
Storage, SK_FORMAT_VERSION, SK_MAGIC,
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, Storage,
|
||||
SK_FORMAT_VERSION, SK_MAGIC,
|
||||
};
|
||||
use crate::WalAcceptorConf;
|
||||
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
|
||||
@@ -32,7 +32,7 @@ struct SharedState {
|
||||
sk: SafeKeeper<FileStorage>,
|
||||
/// For receiving-sending wal cooperation
|
||||
/// quorum commit LSN we've notified walsenders about
|
||||
notified_commit_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
/// combined hot standby feedback from all replicas
|
||||
hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
@@ -72,7 +72,7 @@ impl SharedState {
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
notified_commit_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
sk: SafeKeeper::new(Lsn(flush_lsn), tli, storage, state),
|
||||
hs_feedback: HotStandbyFeedback {
|
||||
ts: 0,
|
||||
@@ -112,7 +112,7 @@ impl SharedState {
|
||||
}
|
||||
match opts.open(&control_file_path) {
|
||||
Ok(mut file) => {
|
||||
// Lock file to prevent two or more active safekeepers
|
||||
// Lock file to prevent two or more active wal_acceptors
|
||||
match file.try_lock_exclusive() {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
@@ -186,7 +186,7 @@ impl Timeline {
|
||||
pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
loop {
|
||||
let commit_lsn = shared_state.notified_commit_lsn;
|
||||
let commit_lsn = shared_state.commit_lsn;
|
||||
// This must be `>`, not `>=`.
|
||||
if commit_lsn > lsn {
|
||||
return commit_lsn;
|
||||
@@ -198,8 +198,8 @@ impl Timeline {
|
||||
// Notify caught-up WAL senders about new WAL data received
|
||||
pub fn notify_wal_senders(&self, commit_lsn: Lsn) {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
if shared_state.notified_commit_lsn < commit_lsn {
|
||||
shared_state.notified_commit_lsn = commit_lsn;
|
||||
if shared_state.commit_lsn < commit_lsn {
|
||||
shared_state.commit_lsn = commit_lsn;
|
||||
self.cond.notify_all();
|
||||
}
|
||||
}
|
||||
@@ -289,7 +289,7 @@ lazy_static! {
|
||||
}
|
||||
|
||||
/// A zero-sized struct used to manage access to the global timelines map.
|
||||
pub struct GlobalTimelines;
|
||||
struct GlobalTimelines;
|
||||
|
||||
impl GlobalTimelines {
|
||||
/// Get a timeline with control file loaded from the global TIMELINES map.
|
||||
@@ -337,14 +337,14 @@ impl Storage for FileStorage {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
fn write_wal(&mut self, s: &SafeKeeperState, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
let wal_seg_size = server.wal_seg_size as usize;
|
||||
let ztli = server.ztli;
|
||||
let wal_seg_size = s.server.wal_seg_size as usize;
|
||||
let ztli = s.server.ztli;
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size) as usize;
|
||||
@@ -365,7 +365,7 @@ impl Storage for FileStorage {
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
// note: we basically don't support changing pg timeline
|
||||
let wal_file_name = XLogFileName(server.tli, segno, wal_seg_size);
|
||||
let wal_file_name = XLogFileName(s.server.tli, segno, wal_seg_size);
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
|
||||
@@ -12,7 +12,13 @@ use crate::WalAcceptorConf;
|
||||
use zenith_utils::postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
pub fn thread_main(conf: WalAcceptorConf, listener: TcpListener) -> Result<()> {
|
||||
pub fn thread_main(conf: WalAcceptorConf) -> Result<()> {
|
||||
info!("Starting wal acceptor on {}", conf.listen_addr);
|
||||
let listener = TcpListener::bind(conf.listen_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
loop {
|
||||
match listener.accept() {
|
||||
Ok((socket, peer_addr)) => {
|
||||
@@ -35,8 +41,8 @@ fn handle_socket(socket: TcpStream, conf: WalAcceptorConf) -> Result<()> {
|
||||
socket.set_nodelay(true)?;
|
||||
|
||||
let mut conn_handler = SendWalHandler::new(conf);
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?;
|
||||
// libpq replication protocol between safekeeper and replicas/pagers
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
|
||||
// libpq replication protocol between wal_acceptor and replicas/pagers
|
||||
pgbackend.run(&mut conn_handler)?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -88,12 +88,7 @@ fn main() -> Result<()> {
         )
         .subcommand(SubCommand::with_name("status"))
         .subcommand(SubCommand::with_name("start").about("Start local pageserver"))
-        .subcommand(SubCommand::with_name("stop").about("Stop local pageserver")
-            .arg(Arg::with_name("immediate")
-                .help("Don't flush repository data at shutdown")
-                .required(false)
-            )
-        )
+        .subcommand(SubCommand::with_name("stop").about("Stop local pageserver"))
         .subcommand(SubCommand::with_name("restart").about("Restart local pageserver"))
         .subcommand(
             SubCommand::with_name("pg")

@@ -201,12 +196,10 @@ fn main() -> Result<()> {
             }
         }
 
-        ("stop", Some(stop_match)) => {
+        ("stop", Some(_sub_m)) => {
             let pageserver = PageServerNode::from_env(&env);
 
-            let immediate = stop_match.is_present("immediate");
-
-            if let Err(e) = pageserver.stop(immediate) {
+            if let Err(e) = pageserver.stop() {
                 eprintln!("pageserver stop failed: {}", e);
                 exit(1);
             }

@@ -215,8 +208,7 @@ fn main() -> Result<()> {
         ("restart", Some(_sub_m)) => {
             let pageserver = PageServerNode::from_env(&env);
 
-            //TODO what shutdown strategy should we use here?
-            if let Err(e) = pageserver.stop(false) {
+            if let Err(e) = pageserver.stop() {
                 eprintln!("pageserver stop failed: {}", e);
                 exit(1);
             }
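
The dropped --immediate flag followed the usual clap 2.x pattern: an Arg attached to a SubCommand, read back with is_present. A minimal sketch of that pattern, assuming clap 2.x; the binary name and flag wiring here are illustrative:

    use clap::{App, Arg, SubCommand};

    fn main() {
        let matches = App::new("zenith")
            .subcommand(
                SubCommand::with_name("stop")
                    .about("Stop local pageserver")
                    .arg(
                        Arg::with_name("immediate")
                            .long("immediate")
                            .help("Don't flush repository data at shutdown")
                            .required(false),
                    ),
            )
            .get_matches();

        // clap 2.x reports the active subcommand as (name, Option<&ArgMatches>).
        if let ("stop", Some(stop_match)) = matches.subcommand() {
            let immediate = stop_match.is_present("immediate");
            println!("stop requested, immediate = {}", immediate);
        }
    }
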
@@ -7,4 +7,3 @@ edition = "2018"
 prometheus = {version = "0.12", default_features=false} # removes protobuf dependency
 libc = "0.2"
 lazy_static = "1.4"
-once_cell = "1.8.0"
@@ -3,10 +3,7 @@
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
 use lazy_static::lazy_static;
-use once_cell::race::OnceBox;
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_gauge, Gauge};
 pub use prometheus::{register_gauge_vec, GaugeVec};
 pub use prometheus::{register_histogram, Histogram};
 pub use prometheus::{register_histogram_vec, HistogramVec};
 pub use prometheus::{register_int_counter, IntCounter};

@@ -23,55 +20,17 @@ pub use wrappers::{CountedReader, CountedWriter};
 /// Metrics gathering is a relatively simple and standalone operation, so
 /// it might be fine to do it this way to keep things simple.
 pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
-    update_rusage_metrics();
+    update_io_metrics();
     prometheus::gather()
 }
 
-static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new();
-
-/// Sets a prefix which will be used for all common metrics, typically a service
-/// name like 'pageserver'. Should be executed exactly once in the beginning of
-/// any executable which uses common metrics.
-pub fn set_common_metrics_prefix(prefix: &'static str) {
-    // Not unwrap() because metrics may be initialized after multiple threads have been started.
-    COMMON_METRICS_PREFIX
-        .set(prefix.into())
-        .unwrap_or_else(|_| {
-            eprintln!(
-                "set_common_metrics_prefix() was called second time with '{}', exiting",
-                prefix
-            );
-            std::process::exit(1);
-        });
-}
-
-/// Prepends a prefix to a common metric name so they are distinguished between
-/// different services, see <https://github.com/zenithdb/zenith/pull/681>
-/// A call to set_common_metrics_prefix() is necessary prior to calling this.
-pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
-    // Not unwrap() because metrics may be initialized after multiple threads have been started.
-    format!(
-        "{}_{}",
-        COMMON_METRICS_PREFIX.get().unwrap_or_else(|| {
-            eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting");
-            std::process::exit(1);
-        }),
-        unprefixed_metric_name
-    )
-}
-
 lazy_static! {
     static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
-        new_common_metric_name("disk_io_bytes"),
+        "pageserver_disk_io_bytes",
         "Bytes written and read from disk, grouped by the operation (read|write)",
         &["io_operation"]
     )
     .expect("Failed to register disk i/o bytes int gauge vec");
-    static ref MAXRSS_KB: IntGauge = register_int_gauge!(
-        new_common_metric_name("maxrss_kb"),
-        "Memory usage (Maximum Resident Set Size)"
-    )
-    .expect("Failed to register maxrss_kb int gauge");
 }
 
 // Records I/O stats in a "cross-platform" way.
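
The removed prefix machinery is a one-time global set/get built on once_cell's race::OnceBox. A minimal sketch of that pattern under illustrative names; unlike this sketch, the real code exits the process instead of panicking, since metrics may already be touched from several threads:

    use once_cell::race::OnceBox;

    static PREFIX: OnceBox<&'static str> = OnceBox::new();

    fn set_prefix(prefix: &'static str) {
        // OnceBox::set takes a Box and errors if a value was already stored,
        // so a second call is detected rather than silently overwriting.
        PREFIX
            .set(Box::new(prefix))
            .expect("prefix was already set");
    }

    fn metric_name(name: &str) -> String {
        format!("{}_{}", PREFIX.get().expect("prefix not set"), name)
    }

    fn main() {
        set_prefix("pageserver");
        assert_eq!(metric_name("disk_io_bytes"), "pageserver_disk_io_bytes");
    }
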
@@ -83,7 +42,7 @@ lazy_static! {
 // We know the size of the block, so we can determine the I/O bytes out of it.
 // The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
 #[allow(clippy::unnecessary_cast)]
-fn update_rusage_metrics() {
+fn update_io_metrics() {
     let rusage_stats = get_rusage_stats();
 
     const BYTES_IN_BLOCK: i64 = 512;

@@ -93,7 +52,6 @@ fn update_rusage_metrics() {
     DISK_IO_BYTES
         .with_label_values(&["write"])
         .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
-    MAXRSS_KB.set(rusage_stats.ru_maxrss);
 }
 
 fn get_rusage_stats() -> libc::rusage {
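
get_rusage_stats() wraps the getrusage(2) call from libc; on Linux, ru_inblock and ru_oublock count 512-byte blocks, which is where BYTES_IN_BLOCK comes from. A self-contained sketch of that wrapper:

    use libc::{getrusage, rusage, RUSAGE_SELF};

    fn get_rusage_stats() -> rusage {
        // getrusage(2) fills the struct in place; zeroed() is a valid
        // initial value for this plain C struct.
        let mut ru: rusage = unsafe { std::mem::zeroed() };
        unsafe { getrusage(RUSAGE_SELF, &mut ru) };
        ru
    }

    fn main() {
        const BYTES_IN_BLOCK: i64 = 512;
        let stats = get_rusage_stats();
        println!(
            "disk read: {} bytes, disk written: {} bytes",
            stats.ru_inblock * BYTES_IN_BLOCK,
            stats.ru_oublock * BYTES_IN_BLOCK
        );
    }
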
@@ -18,9 +18,12 @@ serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 thiserror = "1.0"
 tokio = "1.11"
 tracing = "0.1"
-tracing-log = "0.1"
-tracing-subscriber = "0.2"
+
+slog-async = "2.6.0"
+slog-stdlog = "4.1.0"
+slog-scope = "4.4.0"
+slog-term = "2.8.0"
+slog = "2.7.0"
 
 zenith_metrics = { path = "../zenith_metrics" }
 workspace_hack = { path = "../workspace_hack" }
@@ -35,4 +38,3 @@ rustls-split = "0.2.1"
 hex-literal = "0.3"
 bytes = "1.0"
 webpki = "0.21"
-tempfile = "3.2"
@@ -1,33 +0,0 @@
-/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you
-/// feed the accumulated values by calling the 'accum' function, instead of having an
-/// iterator.
-///
-/// For example, to calculate the smallest value among some integers:
-///
-/// ```
-/// use zenith_utils::accum::Accum;
-///
-/// let values = [1, 2, 3];
-///
-/// let mut min_value: Accum<u32> = Accum(None);
-/// for new_value in &values {
-///     min_value.accum(std::cmp::min, *new_value);
-/// }
-///
-/// assert_eq!(min_value.0.unwrap(), 1);
-/// ```
-pub struct Accum<T>(pub Option<T>);
-impl<T: Copy> Accum<T> {
-    pub fn accum<F>(&mut self, func: F, new_value: T)
-    where
-        F: FnOnce(T, T) -> T,
-    {
-        // If there is no previous value, just store the new value.
-        // Otherwise call the function to decide which one to keep.
-        self.0 = Some(if let Some(accum) = self.0 {
-            func(accum, new_value)
-        } else {
-            new_value
-        });
-    }
-}
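
The deleted helper is, as its own doc comment says, Iterator::reduce without the iterator. Its doc example can be expressed with reduce directly, which is presumably why the type could go:

    fn main() {
        let values = [1u32, 2, 3];
        // Same running minimum as the Accum doc example, via Iterator::reduce.
        let min_value = values.iter().copied().reduce(std::cmp::min);
        assert_eq!(min_value, Some(1));
    }
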
@@ -94,12 +94,9 @@ pub fn le_coder() -> impl Options {
 
 /// Binary serialize/deserialize helper functions (Big Endian)
 ///
-pub trait BeSer {
+pub trait BeSer: Serialize + DeserializeOwned {
     /// Serialize into a byte slice
-    fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError>
-    where
-        Self: Serialize,
-    {
+    fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError> {
         // &mut [u8] implements Write, but `ser_into` needs a mutable
         // reference to that. So we need the slightly awkward "mutable
         // reference to a mutable reference.

@@ -110,28 +107,19 @@ pub trait BeSer {
     ///
     /// This is useful for most `Write` types except `&mut [u8]`, which
     /// can more easily use [`ser_into_slice`](Self::ser_into_slice).
-    fn ser_into<W: Write>(&self, w: &mut W) -> Result<(), SerializeError>
-    where
-        Self: Serialize,
-    {
+    fn ser_into<W: Write>(&self, w: &mut W) -> Result<(), SerializeError> {
         be_coder().serialize_into(w, &self).map_err(|e| e.into())
     }
 
     /// Serialize into a new heap-allocated buffer
-    fn ser(&self) -> Result<Vec<u8>, SerializeError>
-    where
-        Self: Serialize,
-    {
+    fn ser(&self) -> Result<Vec<u8>, SerializeError> {
         be_coder().serialize(&self).map_err(|e| e.into())
     }
 
     /// Deserialize from the full contents of a byte slice
     ///
     /// See also: [`BeSer::des_prefix`]
-    fn des(buf: &[u8]) -> Result<Self, DeserializeError>
-    where
-        Self: DeserializeOwned,
-    {
+    fn des(buf: &[u8]) -> Result<Self, DeserializeError> {
         be_coder()
             .deserialize(buf)
             .or(Err(DeserializeError::BadInput))

@@ -143,10 +131,7 @@ pub trait BeSer {
     /// type, but does not guarantee that the entire slice is used.
     ///
     /// See also: [`BeSer::des`]
-    fn des_prefix(buf: &[u8]) -> Result<Self, DeserializeError>
-    where
-        Self: DeserializeOwned,
-    {
+    fn des_prefix(buf: &[u8]) -> Result<Self, DeserializeError> {
         be_coder()
             .allow_trailing_bytes()
             .deserialize(buf)

@@ -154,10 +139,7 @@ pub trait BeSer {
     }
 
     /// Deserialize from a reader
-    fn des_from<R: Read>(r: &mut R) -> Result<Self, DeserializeError>
-    where
-        Self: DeserializeOwned,
-    {
+    fn des_from<R: Read>(r: &mut R) -> Result<Self, DeserializeError> {
         be_coder().deserialize_from(r).map_err(|e| e.into())
     }

@@ -165,22 +147,16 @@ pub trait BeSer {
     ///
     /// Note: it may be faster to serialize to a buffer and then measure the
     /// buffer length, than to call `serialized_size` and then `ser_into`.
-    fn serialized_size(&self) -> Result<u64, SerializeError>
-    where
-        Self: Serialize,
-    {
+    fn serialized_size(&self) -> Result<u64, SerializeError> {
         be_coder().serialized_size(self).map_err(|e| e.into())
     }
 }
 
 /// Binary serialize/deserialize helper functions (Little Endian)
 ///
-pub trait LeSer {
+pub trait LeSer: Serialize + DeserializeOwned {
     /// Serialize into a byte slice
-    fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError>
-    where
-        Self: Serialize,
-    {
+    fn ser_into_slice(&self, mut b: &mut [u8]) -> Result<(), SerializeError> {
         // &mut [u8] implements Write, but `ser_into` needs a mutable
         // reference to that. So we need the slightly awkward "mutable
         // reference to a mutable reference.

@@ -191,28 +167,19 @@ pub trait LeSer {
     ///
     /// This is useful for most `Write` types except `&mut [u8]`, which
     /// can more easily use [`ser_into_slice`](Self::ser_into_slice).
-    fn ser_into<W: Write>(&self, w: &mut W) -> Result<(), SerializeError>
-    where
-        Self: Serialize,
-    {
+    fn ser_into<W: Write>(&self, w: &mut W) -> Result<(), SerializeError> {
         le_coder().serialize_into(w, &self).map_err(|e| e.into())
     }
 
     /// Serialize into a new heap-allocated buffer
-    fn ser(&self) -> Result<Vec<u8>, SerializeError>
-    where
-        Self: Serialize,
-    {
+    fn ser(&self) -> Result<Vec<u8>, SerializeError> {
         le_coder().serialize(&self).map_err(|e| e.into())
     }
 
     /// Deserialize from the full contents of a byte slice
     ///
     /// See also: [`LeSer::des_prefix`]
-    fn des(buf: &[u8]) -> Result<Self, DeserializeError>
-    where
-        Self: DeserializeOwned,
-    {
+    fn des(buf: &[u8]) -> Result<Self, DeserializeError> {
         le_coder()
             .deserialize(buf)
             .or(Err(DeserializeError::BadInput))

@@ -224,10 +191,7 @@ pub trait LeSer {
     /// type, but does not guarantee that the entire slice is used.
     ///
     /// See also: [`LeSer::des`]
-    fn des_prefix(buf: &[u8]) -> Result<Self, DeserializeError>
-    where
-        Self: DeserializeOwned,
-    {
+    fn des_prefix(buf: &[u8]) -> Result<Self, DeserializeError> {
         le_coder()
             .allow_trailing_bytes()
             .deserialize(buf)

@@ -235,10 +199,7 @@ pub trait LeSer {
     }
 
     /// Deserialize from a reader
-    fn des_from<R: Read>(r: &mut R) -> Result<Self, DeserializeError>
-    where
-        Self: DeserializeOwned,
-    {
+    fn des_from<R: Read>(r: &mut R) -> Result<Self, DeserializeError> {
         le_coder().deserialize_from(r).map_err(|e| e.into())
     }

@@ -246,18 +207,14 @@ pub trait LeSer {
     ///
     /// Note: it may be faster to serialize to a buffer and then measure the
     /// buffer length, than to call `serialized_size` and then `ser_into`.
-    fn serialized_size(&self) -> Result<u64, SerializeError>
-    where
-        Self: Serialize,
-    {
+    fn serialized_size(&self) -> Result<u64, SerializeError> {
         le_coder().serialized_size(self).map_err(|e| e.into())
     }
 }
 
-// Because usage of `BeSer` or `LeSer` can be done with *either* a Serialize or
-// DeserializeOwned implementation, the blanket implementation has to be for every type.
-impl<T> BeSer for T {}
-impl<T> LeSer for T {}
+impl<T> BeSer for T where T: Serialize + DeserializeOwned {}
+
+impl<T> LeSer for T where T: Serialize + DeserializeOwned {}
 
 #[cfg(test)]
 mod tests {
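
The shape of this refactor: the per-method `where Self: Serialize` / `where Self: DeserializeOwned` clauses move onto the trait as supertrait bounds, so the blanket impl carries the bounds instead of covering every type. A toy sketch of the same move; ToyBeSer and the bincode round-trip are illustrative stand-ins, not the real BeSer/LeSer API:

    use serde::{de::DeserializeOwned, Serialize};

    // Bounds live on the trait itself, so every method can assume them.
    trait ToyBeSer: Serialize + DeserializeOwned {
        fn roundtrip(&self) -> Self {
            // bincode stands in for the crate's be_coder()/le_coder() helpers.
            let bytes = bincode::serialize(self).expect("serialize");
            bincode::deserialize(&bytes).expect("deserialize")
        }
    }

    // The blanket impl now needs matching bounds, replacing the old
    // unconditional `impl<T> ToyBeSer for T {}`.
    impl<T> ToyBeSer for T where T: Serialize + DeserializeOwned {}

    fn main() {
        assert_eq!(42u32.roundtrip(), 42);
    }
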
@@ -29,24 +29,24 @@ mod tests {
 
     #[test]
     fn test_connection_host_port() {
-        let config: Config = "postgresql://no_user@127.0.0.1:64000/no_db"
+        let config: Config = "postgresql://no_user@localhost:64000/no_db"
             .parse()
             .unwrap();
         assert_eq!(
             connection_host_port(&config),
-            ("127.0.0.1".to_owned(), 64000)
+            ("localhost".to_owned(), 64000)
         );
     }
 
     #[test]
     #[should_panic(expected = "only one pair of host and port is supported in connection string")]
     fn test_connection_host_port_multiple_ports() {
-        let config: Config = "postgresql://no_user@127.0.0.1:64000,127.0.0.1:64001/no_db"
+        let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
            .parse()
            .unwrap();
        assert_eq!(
            connection_host_port(&config),
-            ("127.0.0.1".to_owned(), 64000)
+            ("localhost".to_owned(), 64000)
        );
    }
 }
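
For context, a hedged sketch of what connection_host_port() is being tested for: extracting the single host/port pair from a tokio_postgres::Config and panicking when the connection string carries more than one. This is a plausible reimplementation for illustration, not the project's actual helper:

    use tokio_postgres::config::Host;
    use tokio_postgres::Config;

    fn connection_host_port(config: &Config) -> (String, u16) {
        assert_eq!(
            config.get_hosts().len(),
            1,
            "only one pair of host and port is supported in connection string"
        );
        let host = match &config.get_hosts()[0] {
            Host::Tcp(host) => host.clone(),
            Host::Unix(path) => path.to_string_lossy().into_owned(),
        };
        (host, config.get_ports()[0])
    }

    fn main() {
        let config: Config = "postgresql://no_user@localhost:64000/no_db"
            .parse()
            .unwrap();
        assert_eq!(connection_host_port(&config), ("localhost".to_owned(), 64000));
    }
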
Some files were not shown because too many files have changed in this diff.