mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 05:20:38 +00:00
Compare commits
36 Commits
walredo-co
...
vec-heap
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2975c26de7 | ||
|
|
5f4f7d9762 | ||
|
|
f658263543 | ||
|
|
64ca947722 | ||
|
|
23f4c0a742 | ||
|
|
7c5b99683c | ||
|
|
160c4aff61 | ||
|
|
6e5ca5dc5c | ||
|
|
f3445949d1 | ||
|
|
95a85312f5 | ||
|
|
934fb8592f | ||
|
|
bb239b4f69 | ||
|
|
1cd7900790 | ||
|
|
8c61c3e54e | ||
|
|
d7c9dd06f4 | ||
|
|
b9119f11bf | ||
|
|
7216f22609 | ||
|
|
bf58f7f649 | ||
|
|
3f0ebc6a40 | ||
|
|
0baf4bc796 | ||
|
|
c356030660 | ||
|
|
c4bb6d78d4 | ||
|
|
3b82e806f2 | ||
|
|
403d9779d9 | ||
|
|
b3b8f18f61 | ||
|
|
960c7d69a8 | ||
|
|
60dae0b4ac | ||
|
|
c660926a06 | ||
|
|
7fa04e2d14 | ||
|
|
db4059cd6d | ||
|
|
fdb19fdb92 | ||
|
|
53b4dc944d | ||
|
|
a03e1b3895 | ||
|
|
15f1bcc9c2 | ||
|
|
24580f2493 | ||
|
|
e3945d94fd |
@@ -24,6 +24,12 @@ jobs:
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: zenith-build-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
enum: ["debug", "release"]
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
# Checkout the git repo (circleci doesn't have a flag to enable submodules here)
|
||||
- checkout
|
||||
@@ -39,7 +45,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
# FIXME We could cache our own docker container, instead of installing packages every time.
|
||||
- run:
|
||||
@@ -59,12 +65,12 @@ jobs:
|
||||
if [ ! -e tmp_install/bin/postgres ]; then
|
||||
# "depth 1" saves some time by not cloning the whole repo
|
||||
git submodule update --init --depth 1
|
||||
make postgres
|
||||
make postgres -j8
|
||||
fi
|
||||
|
||||
- save_cache:
|
||||
name: Save postgres cache
|
||||
key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
@@ -96,7 +102,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
@@ -254,7 +260,7 @@ jobs:
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
@@ -328,14 +334,18 @@ workflows:
|
||||
build_and_test:
|
||||
jobs:
|
||||
- check-codestyle
|
||||
- build-postgres
|
||||
- build-postgres:
|
||||
name: build-postgres-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
requires:
|
||||
- build-postgres
|
||||
- build-postgres-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: pg_regress-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
|
||||
@@ -11,4 +11,8 @@ test_output
|
||||
.vscode
|
||||
.zenith
|
||||
integration_tests/.zenith
|
||||
.mypy_cache
|
||||
.mypy_cache
|
||||
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
|
||||
|
||||
247
Cargo.lock
generated
247
Cargo.lock
generated
@@ -26,18 +26,21 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.42"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "595d3cfa7a60d4555cb5067b99f07142a08ea778de5cf993f7b75c7d8fabc486"
|
||||
|
||||
[[package]]
|
||||
name = "arc-swap"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e906254e445520903e7fc9da4f709886c84ae4bc4ddaf0e093188d66df4dc820"
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.50"
|
||||
@@ -298,7 +301,7 @@ version = "2.33.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
"ansi_term 0.11.0",
|
||||
"atty",
|
||||
"bitflags",
|
||||
"strsim",
|
||||
@@ -387,26 +390,6 @@ dependencies = [
|
||||
"rustc_version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crypto-mac"
|
||||
version = "0.10.0"
|
||||
@@ -445,16 +428,6 @@ dependencies = [
|
||||
"dirs-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-next"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"dirs-sys-next",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys"
|
||||
version = "0.3.6"
|
||||
@@ -466,17 +439,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys-next"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"redox_users",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dlv-list"
|
||||
version = "0.2.3"
|
||||
@@ -956,6 +918,15 @@ dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matches"
|
||||
version = "0.1.8"
|
||||
@@ -1220,10 +1191,12 @@ dependencies = [
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"toml",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
@@ -1531,6 +1504,15 @@ dependencies = [
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||
dependencies = [
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.25"
|
||||
@@ -1689,12 +1671,6 @@ dependencies = [
|
||||
"webpki",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.5"
|
||||
@@ -1852,12 +1828,32 @@ dependencies = [
|
||||
"opaque-debug",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42a568c8f2cd051a4d283bd6eb0343ac214c1b0f1ac19f93e1175b2dee38c73d"
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook"
|
||||
version = "0.3.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c98891d737e271a2954825ef19e46bd16bdb98e2746f2eec4f7a4ef7946efd1"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"signal-hook-registry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.4.0"
|
||||
@@ -1890,59 +1886,6 @@ version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527"
|
||||
|
||||
[[package]]
|
||||
name = "slog"
|
||||
version = "2.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06"
|
||||
|
||||
[[package]]
|
||||
name = "slog-async"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c60813879f820c85dbc4eabf3269befe374591289019775898d56a81a804fbdc"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"slog",
|
||||
"take_mut",
|
||||
"thread_local",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog-scope"
|
||||
version = "4.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f95a4b4c3274cd2869549da82b57ccc930859bdbf5bcea0424bc5f140b3c786"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"lazy_static",
|
||||
"slog",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog-stdlog"
|
||||
version = "4.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8228ab7302adbf4fcb37e66f3cda78003feb521e7fd9e3847ec117a7784d0f5a"
|
||||
dependencies = [
|
||||
"log",
|
||||
"slog",
|
||||
"slog-scope",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog-term"
|
||||
version = "2.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95c1e7e5aab61ced6006149ea772770b84a0d16ce0f7885def313e4829946d76"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"chrono",
|
||||
"slog",
|
||||
"term",
|
||||
"thread_local",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.6.1"
|
||||
@@ -1998,12 +1941,6 @@ dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "take_mut"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
|
||||
|
||||
[[package]]
|
||||
name = "tap"
|
||||
version = "1.0.1"
|
||||
@@ -2035,17 +1972,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "term"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
|
||||
dependencies = [
|
||||
"dirs-next",
|
||||
"rustversion",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.1.2"
|
||||
@@ -2223,24 +2149,79 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.26"
|
||||
version = "0.1.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
|
||||
checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
|
||||
checksum = "f4f480b8f81512e825f337ad51e94c1eb5d3bbdf2b363dcd01e2b19a9ffe3f8e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-serde"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.2.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e0d2eaa99c3c2e41547cfa109e910a68ea03823cccad4a0525dcbc9b01e8c71"
|
||||
dependencies = [
|
||||
"ansi_term 0.12.1",
|
||||
"chrono",
|
||||
"lazy_static",
|
||||
"matchers",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.3"
|
||||
@@ -2339,6 +2320,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"clap",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"daemonize",
|
||||
"fs2",
|
||||
@@ -2358,6 +2340,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
]
|
||||
|
||||
@@ -2603,14 +2586,12 @@ dependencies = [
|
||||
"rustls-split",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"slog",
|
||||
"slog-async",
|
||||
"slog-scope",
|
||||
"slog-stdlog",
|
||||
"slog-term",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-log",
|
||||
"tracing-subscriber",
|
||||
"webpki",
|
||||
"workspace_hack",
|
||||
"zenith_metrics",
|
||||
|
||||
@@ -10,6 +10,7 @@ FROM zenithdb/build:buster AS pg-build
|
||||
WORKDIR /zenith
|
||||
COPY ./vendor/postgres vendor/postgres
|
||||
COPY ./Makefile Makefile
|
||||
ENV BUILD_TYPE release
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
|
||||
RUN rm -rf postgres_install/build
|
||||
|
||||
@@ -36,7 +37,9 @@ RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl
|
||||
mkdir zenith_install
|
||||
|
||||
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
|
||||
# TODO: temporary alias for compatibility, see https://github.com/zenithdb/zenith/pull/740
|
||||
RUN ln -s /usr/local/bin/safekeeper /usr/local/bin/wal_acceptor
|
||||
COPY --from=build /zenith/target/release/proxy /usr/local/bin
|
||||
COPY --from=pg-build /zenith/tmp_install postgres_install
|
||||
COPY docker-entrypoint.sh /docker-entrypoint.sh
|
||||
|
||||
@@ -81,7 +81,9 @@ FROM alpine:3.13
|
||||
RUN apk add --update openssl build-base libseccomp-dev
|
||||
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
|
||||
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
|
||||
# TODO: temporary alias for compatibility, see https://github.com/zenithdb/zenith/pull/740
|
||||
RUN ln -s /usr/local/bin/safekeeper /usr/local/bin/wal_acceptor
|
||||
COPY --from=build /zenith/target/release/proxy /usr/local/bin
|
||||
COPY --from=pg-build /zenith/tmp_install /usr/local
|
||||
COPY docker-entrypoint.sh /docker-entrypoint.sh
|
||||
|
||||
58
Makefile
58
Makefile
@@ -6,34 +6,55 @@ else
|
||||
SECCOMP =
|
||||
endif
|
||||
|
||||
#
|
||||
# We differentiate between release / debug build types using the BUILD_TYPE
|
||||
# environment variable.
|
||||
#
|
||||
BUILD_TYPE ?= debug
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug
|
||||
PG_CFLAGS = -O2 -g3 $(CFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 $(CFLAGS)
|
||||
else
|
||||
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
# Choose whether we should be silent or verbose
|
||||
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
|
||||
# Fix for a corner case when make doesn't pass a jobserver
|
||||
CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
|
||||
|
||||
# This option has a side effect of passing make jobserver to cargo.
|
||||
# However, we shouldn't do this if `make -n` (--dry-run) has been asked.
|
||||
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
||||
# Force cargo not to print progress bar
|
||||
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
|
||||
#
|
||||
# Top level Makefile to build Zenith and PostgreSQL
|
||||
#
|
||||
.PHONY: all
|
||||
all: zenith postgres
|
||||
|
||||
# We don't want to run 'cargo build' in parallel with the postgres build,
|
||||
# because interleaving cargo build output with postgres build output looks
|
||||
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
|
||||
# much parallelism. (Recursive invocation of postgres target still gets any
|
||||
# '-j' flag from the command line, so 'make -j' is still useful.)
|
||||
.NOTPARALLEL:
|
||||
|
||||
### Zenith Rust bits
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: zenith
|
||||
zenith: postgres-headers
|
||||
cargo build
|
||||
+@echo "Compiling Zenith"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||
|
||||
### PostgreSQL parts
|
||||
tmp_install/build/config.status:
|
||||
+@echo "Configuring postgres build"
|
||||
mkdir -p tmp_install/build
|
||||
(cd tmp_install/build && \
|
||||
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
|
||||
--enable-cassert \
|
||||
--enable-debug \
|
||||
--enable-depend \
|
||||
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
|
||||
$(PG_CONFIGURE_OPTS) \
|
||||
$(SECCOMP) \
|
||||
--prefix=$(abspath tmp_install) > configure.log)
|
||||
|
||||
@@ -47,10 +68,10 @@ postgres-headers: postgres-configure
|
||||
+@echo "Installing PostgreSQL headers"
|
||||
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
|
||||
|
||||
|
||||
# Compile and install PostgreSQL and contrib/zenith
|
||||
.PHONY: postgres
|
||||
postgres: postgres-configure
|
||||
postgres: postgres-configure \
|
||||
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
|
||||
+@echo "Compiling PostgreSQL"
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
|
||||
+@echo "Compiling contrib/zenith"
|
||||
@@ -58,18 +79,21 @@ postgres: postgres-configure
|
||||
+@echo "Compiling contrib/zenith_test_utils"
|
||||
$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
|
||||
|
||||
.PHONY: postgres-clean
|
||||
postgres-clean:
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean:
|
||||
cd tmp_install/build && ${MAKE} clean
|
||||
cargo clean
|
||||
cd tmp_install/build && $(MAKE) clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
# This removes everything
|
||||
.PHONY: distclean
|
||||
distclean:
|
||||
rm -rf tmp_install
|
||||
cargo clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
.PHONY: fmt
|
||||
fmt:
|
||||
|
||||
@@ -199,23 +199,45 @@ impl PageServerNode {
|
||||
bail!("pageserver failed to start in {} seconds", RETRIES);
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> anyhow::Result<()> {
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid = read_pidfile(&self.pid_file())?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
if immediate {
|
||||
println!("Stop pageserver immediately");
|
||||
if kill(pid, Signal::SIGQUIT).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
} else {
|
||||
println!("Stop pageserver gracefully");
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
}
|
||||
}
|
||||
|
||||
// wait for pageserver stop
|
||||
let address = connection_address(&self.pg_connection_config);
|
||||
for _ in 0..5 {
|
||||
let stream = TcpStream::connect(&address);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
if let Err(_e) = stream {
|
||||
println!("Pageserver stopped");
|
||||
return Ok(());
|
||||
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if pageserver checkpoints a lot of data
|
||||
for _ in 0..100 {
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Pageserver stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("Pageserver status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
println!("Pageserver status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Pageserver still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
println!("Stopping pageserver on {}", address);
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
@@ -313,8 +335,9 @@ impl PageServerNode {
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
// TODO Looks like this flag is never set
|
||||
if self.kill_on_exit {
|
||||
let _ = self.stop();
|
||||
let _ = self.stop(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
Currently we build two main images:
|
||||
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
|
||||
|
||||
And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
|
||||
|
||||
@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id
|
||||
|
||||
### Safety
|
||||
|
||||
For now particular tenant can only appear on a particular pageserver. Set of WAL acceptors are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
|
||||
For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
|
||||
|
||||
@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.11", features = ["process", "macros", "fs"] }
|
||||
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
@@ -35,6 +35,8 @@ scopeguard = "1.1.0"
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
async-trait = "0.1"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
|
||||
@@ -31,7 +31,7 @@ use zenith_utils::lsn::Lsn;
|
||||
pub struct Basebackup<'a> {
|
||||
ar: Builder<&'a mut dyn Write>,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
pub lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -97,7 +97,6 @@ impl<'a> Basebackup<'a> {
|
||||
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
|
||||
// Create pgdata subdirs structure
|
||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||
info!("send subdir {:?}", *dir);
|
||||
let header = new_tar_header_dir(*dir)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// Main entry point for the Page Server executable
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use pageserver::defaults::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
@@ -12,9 +11,19 @@ use std::{
|
||||
str::FromStr,
|
||||
thread,
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use signal_hook::consts::signal::*;
|
||||
use signal_hook::consts::TERM_SIGNALS;
|
||||
use signal_hook::flag;
|
||||
use signal_hook::iterator::exfiltrator::WithOrigin;
|
||||
use signal_hook::iterator::SignalsInfo;
|
||||
use std::process::exit;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Arc;
|
||||
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
@@ -28,6 +37,7 @@ use pageserver::{
|
||||
RelishStorageKind, S3Config, LOG_FILE_NAME,
|
||||
};
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::postgres_backend;
|
||||
|
||||
use const_format::formatcp;
|
||||
|
||||
@@ -447,7 +457,18 @@ fn main() -> Result<()> {
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Initialize logger
|
||||
let (_scope_guard, log_file) = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
|
||||
let term_now = Arc::new(AtomicBool::new(false));
|
||||
for sig in TERM_SIGNALS {
|
||||
// When terminated by a second term signal, exit with exit code 1.
|
||||
// This will do nothing the first time (because term_now is false).
|
||||
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
|
||||
// But this will "arm" the above for the second time, by setting it to true.
|
||||
// The order of registering these is important, if you put this one first, it will
|
||||
// first arm and then terminate ‒ all in the first round.
|
||||
flag::register(*sig, Arc::clone(&term_now))?;
|
||||
}
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -480,7 +501,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
|
||||
match daemonize.start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(e) => error!("could not daemonize: {:#}", e),
|
||||
Err(err) => error!(%err, "could not daemonize"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -525,13 +546,42 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?;
|
||||
|
||||
join_handles.push(page_service_thread);
|
||||
for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
|
||||
match info.signal {
|
||||
SIGQUIT => {
|
||||
info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
|
||||
exit(111);
|
||||
}
|
||||
SIGTERM => {
|
||||
info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
|
||||
// Terminate postgres backends
|
||||
postgres_backend::set_pgbackend_shutdown_requested();
|
||||
// Stop all tenants and flush their data
|
||||
tenant_mgr::shutdown_all_tenants()?;
|
||||
// Wait for pageservice thread to complete the job
|
||||
page_service_thread
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
|
||||
for handle in join_handles.into_iter() {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error")
|
||||
// Shut down http router
|
||||
endpoint::shutdown();
|
||||
|
||||
// Wait for all threads
|
||||
for handle in join_handles.into_iter() {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
}
|
||||
info!("Pageserver shut down successfully completed");
|
||||
exit(0);
|
||||
}
|
||||
_ => {
|
||||
debug!("Unknown signal.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -14,12 +14,12 @@ use std::{
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
use tracing::*;
|
||||
|
||||
use log::*;
|
||||
use zenith_utils::crashsafe_dir;
|
||||
use zenith_utils::logging;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::walredo::WalRedoManager;
|
||||
@@ -100,7 +100,7 @@ pub struct PointInTime {
|
||||
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
|
||||
// Initialize logger
|
||||
// use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
|
||||
let (_scope_guard, _log_file) = logging::init(LOG_FILE_NAME, true)?;
|
||||
let _log_file = logging::init(LOG_FILE_NAME, true)?;
|
||||
|
||||
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
|
||||
// process during repository initialization.
|
||||
@@ -176,7 +176,7 @@ fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
|
||||
// to get bootstrap data for timeline initialization.
|
||||
//
|
||||
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
info!("running initdb... ");
|
||||
info!("running initdb in {}... ", initdbpath.display());
|
||||
|
||||
let initdb_path = conf.pg_bin_dir().join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
@@ -195,7 +195,6 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
String::from_utf8_lossy(&initdb_output.stderr)
|
||||
);
|
||||
}
|
||||
info!("initdb succeeded");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -210,6 +209,8 @@ fn bootstrap_timeline(
|
||||
tli: ZTimelineId,
|
||||
repo: &dyn Repository,
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
|
||||
|
||||
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
|
||||
|
||||
// Init temporarily repo to get bootstrap data
|
||||
@@ -218,14 +219,12 @@ fn bootstrap_timeline(
|
||||
|
||||
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
|
||||
info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn);
|
||||
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
let timeline = repo.create_empty_timeline(tli)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
timeline.as_ref(),
|
||||
timeline.writer().as_ref(),
|
||||
lsn,
|
||||
)?;
|
||||
timeline.checkpoint()?;
|
||||
@@ -419,7 +418,6 @@ fn create_timeline(
|
||||
let timelinedir = conf.timeline_path(&timelineid, tenantid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
|
||||
@@ -6,6 +6,7 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use routerify::{ext::RequestExt, RouterBuilder};
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
use zenith_utils::http::endpoint::auth_middleware;
|
||||
@@ -98,6 +99,7 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, Some(request_data.tenant_id))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
|
||||
branches::create_branch(
|
||||
get_config(&request),
|
||||
&request_data.name,
|
||||
@@ -116,6 +118,7 @@ async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
check_permission(&request, Some(tenantid))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
|
||||
crate::branches::get_branches(get_config(&request), &tenantid)
|
||||
})
|
||||
.await
|
||||
@@ -126,11 +129,12 @@ async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
// TODO add to swagger
|
||||
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let branch_name: &str = get_request_param(&request, "branch_name")?;
|
||||
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
|
||||
let conf = get_state(&request).conf;
|
||||
let path = conf.branch_path(branch_name, &tenantid);
|
||||
let path = conf.branch_path(&branch_name, &tenantid);
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
BranchInfo::from_path(path, conf, &tenantid, &repo)
|
||||
})
|
||||
@@ -144,10 +148,13 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let response_data =
|
||||
tokio::task::spawn_blocking(move || crate::branches::get_tenants(get_config(&request)))
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
crate::branches::get_tenants(get_config(&request))
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
@@ -158,6 +165,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
|
||||
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
|
||||
})
|
||||
.await
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -45,10 +45,10 @@ use crate::layered_repository::storage_layer::{
|
||||
use crate::waldecoder;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
@@ -57,7 +57,7 @@ use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
@@ -131,9 +131,6 @@ pub struct DeltaLayer {
|
||||
|
||||
dropped: bool,
|
||||
|
||||
/// Predecessor layer
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
|
||||
inner: Mutex<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
@@ -144,10 +141,10 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: BTreeMap<(u32, Lsn), BlobRange>,
|
||||
page_version_metas: VecMap<(u32, Lsn), BlobRange>,
|
||||
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -172,29 +169,7 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
}
|
||||
.to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
Some(Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
))
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
@@ -218,10 +193,12 @@ impl Layer for DeltaLayer {
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let mut iter = inner
|
||||
let iter = inner
|
||||
.page_version_metas
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, _entry_lsn), blob_range)) = iter.next_back() {
|
||||
.slice_range((Included(&minkey), Included(&maxkey)))
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, _lsn), blob_range) in iter {
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
if let Some(img) = pv.page_image {
|
||||
@@ -247,16 +224,9 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know about the predecessor layer.
|
||||
// caller know.
|
||||
if need_image {
|
||||
if let Some(cont_layer) = &self.predecessor {
|
||||
Ok(PageReconstructResult::Continue(
|
||||
self.start_lsn,
|
||||
Arc::clone(cont_layer),
|
||||
))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
Ok(PageReconstructResult::Continue(self.start_lsn))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
@@ -265,21 +235,22 @@ impl Layer for DeltaLayer {
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.load()?;
|
||||
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
let slice = inner
|
||||
.relsizes
|
||||
.slice_range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
let result;
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
result = *entry;
|
||||
// Use the base image if needed
|
||||
} else if let Some(predecessor) = &self.predecessor {
|
||||
result = predecessor.get_seg_size(lsn)?;
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
} else {
|
||||
result = 0;
|
||||
Err(anyhow::anyhow!("could not find seg size in delta layer"))
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
@@ -299,17 +270,15 @@ impl Layer for DeltaLayer {
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.page_version_metas = BTreeMap::new();
|
||||
inner.relsizes = BTreeMap::new();
|
||||
inner.page_version_metas = VecMap::default();
|
||||
inner.relsizes = VecMap::default();
|
||||
inner.loaded = false;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
if let Some(path) = self.path() {
|
||||
fs::remove_file(path)?;
|
||||
}
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -326,13 +295,13 @@ impl Layer for DeltaLayer {
|
||||
|
||||
println!("--- relsizes ---");
|
||||
let inner = self.load()?;
|
||||
for (k, v) in inner.relsizes.iter() {
|
||||
for (k, v) in inner.relsizes.as_slice() {
|
||||
println!(" {}: {}", k, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.iter() {
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let mut desc = String::new();
|
||||
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
@@ -381,7 +350,7 @@ impl DeltaLayer {
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create<'a>(
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
@@ -389,10 +358,13 @@ impl DeltaLayer {
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
page_versions: impl Iterator<Item = (&'a (u32, Lsn), &'a PageVersion)>,
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
page_versions: impl Iterator<Item = (u32, Lsn, PageVersion)>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
assert!(!relsizes.is_empty());
|
||||
}
|
||||
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -403,17 +375,14 @@ impl DeltaLayer {
|
||||
dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: true,
|
||||
page_version_metas: BTreeMap::new(),
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes,
|
||||
}),
|
||||
predecessor,
|
||||
};
|
||||
let mut inner = delta_layer.inner.lock().unwrap();
|
||||
|
||||
// Write the in-memory btreemaps into a file
|
||||
let path = delta_layer
|
||||
.path()
|
||||
.expect("DeltaLayer is supposed to have a layer path on disk");
|
||||
let path = delta_layer.path();
|
||||
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
@@ -423,26 +392,28 @@ impl DeltaLayer {
|
||||
|
||||
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
for (key, page_version) in page_versions {
|
||||
let buf = PageVersion::ser(page_version)?;
|
||||
for (blknum, lsn, page_version) in page_versions {
|
||||
// TODO avoid deserializing and then reserializing
|
||||
let buf = PageVersion::ser(&page_version)?;
|
||||
let blob_range = page_version_writer.write_blob(&buf)?;
|
||||
|
||||
let old = inner.page_version_metas.insert(*key, blob_range);
|
||||
|
||||
assert!(old.is_none());
|
||||
inner
|
||||
.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let book = page_version_writer.close()?;
|
||||
|
||||
// Write out page versions
|
||||
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
|
||||
let buf = BTreeMap::ser(&inner.page_version_metas)?;
|
||||
let buf = VecMap::ser(&inner.page_version_metas)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// and relsizes to separate chapter
|
||||
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
|
||||
let buf = BTreeMap::ser(&inner.relsizes)?;
|
||||
let buf = VecMap::ser(&inner.relsizes)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
@@ -476,12 +447,7 @@ impl DeltaLayer {
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
&self.layer_name(),
|
||||
);
|
||||
|
||||
let file = File::open(&path)?;
|
||||
@@ -529,10 +495,10 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
|
||||
let page_version_metas = BTreeMap::des(&chapter)?;
|
||||
let page_version_metas = VecMap::des(&chapter)?;
|
||||
|
||||
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
|
||||
let relsizes = BTreeMap::des(&chapter)?;
|
||||
let relsizes = VecMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
@@ -551,7 +517,6 @@ impl DeltaLayer {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
filename: &DeltaFileName,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
@@ -563,10 +528,9 @@ impl DeltaLayer {
|
||||
dropped: filename.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
}),
|
||||
predecessor,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -587,10 +551,28 @@ impl DeltaLayer {
|
||||
dropped: summary.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
}),
|
||||
predecessor: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
}
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,8 @@ use anyhow::Result;
|
||||
use log::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::METADATA_FILE_NAME;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct DeltaFileName {
|
||||
@@ -35,7 +37,7 @@ impl DeltaFileName {
|
||||
/// Parse a string as a delta file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn from_str(fname: &str) -> Option<Self> {
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
@@ -168,7 +170,7 @@ impl ImageFileName {
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn from_str(fname: &str) -> Option<Self> {
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
@@ -286,15 +288,11 @@ pub fn list_files(
|
||||
let fname = direntry?.file_name();
|
||||
let fname = fname.to_str().unwrap();
|
||||
|
||||
if let Some(deltafilename) = DeltaFileName::from_str(fname) {
|
||||
if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
|
||||
deltafiles.push(deltafilename);
|
||||
} else if let Some(imgfilename) = ImageFileName::from_str(fname) {
|
||||
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
||||
imgfiles.push(imgfilename);
|
||||
} else if fname == "wal"
|
||||
|| fname == "metadata"
|
||||
|| fname == "ancestor"
|
||||
|| fname.ends_with(".old")
|
||||
{
|
||||
} else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
|
||||
@@ -114,25 +114,7 @@ pub struct ImageLayerInner {
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
}
|
||||
.to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
Some(Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
},
|
||||
))
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
@@ -222,9 +204,7 @@ impl Layer for ImageLayer {
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
if let Some(path) = self.path() {
|
||||
fs::remove_file(path)?;
|
||||
}
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -300,9 +280,7 @@ impl ImageLayer {
|
||||
let inner = layer.inner.lock().unwrap();
|
||||
|
||||
// Write the images into a file
|
||||
let path = layer
|
||||
.path()
|
||||
.expect("ImageLayer is supposed to have a layer path on disk");
|
||||
let path = layer.path();
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let file = File::create(&path)?;
|
||||
@@ -340,7 +318,7 @@ impl ImageLayer {
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
trace!("saved {}", path.display());
|
||||
|
||||
drop(inner);
|
||||
|
||||
@@ -445,15 +423,7 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
},
|
||||
);
|
||||
let path = self.path();
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
@@ -500,4 +470,21 @@ impl ImageLayer {
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
}
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,18 +12,17 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use zenith_utils::accum::Accum;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::page_versions::PageVersions;
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
@@ -36,57 +35,56 @@ pub struct InMemoryLayer {
|
||||
///
|
||||
start_lsn: Lsn,
|
||||
|
||||
/// Frozen in-memory layers have an inclusive end LSN.
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// LSN of the oldest page version stored in this layer
|
||||
oldest_pending_lsn: Lsn,
|
||||
|
||||
/// The above fields never change. The parts that do change are in 'inner',
|
||||
/// and protected by mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
/// Predecessor layer might be needed?
|
||||
incremental: bool,
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// Frozen in-memory layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// If this relation was dropped, remember when that happened.
|
||||
drop_lsn: Option<Lsn>,
|
||||
/// The drop LSN is recorded in [`end_lsn`].
|
||||
dropped: bool,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
///
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
page_versions: PageVersions,
|
||||
|
||||
///
|
||||
/// `segsizes` tracks the size of the segment at different points in time.
|
||||
///
|
||||
segsizes: BTreeMap<Lsn, u32>,
|
||||
|
||||
/// Writes are only allowed when true.
|
||||
/// Set to false when this layer is in the process of being replaced.
|
||||
writeable: bool,
|
||||
|
||||
/// Predecessor layer
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
|
||||
/// so that determining the size never depends on the predecessor layer. For
|
||||
/// a non-blocky rel, 'segsizes' is not used and is always empty.
|
||||
///
|
||||
segsizes: VecMap<Lsn, u32>,
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
fn check_writeable(&self) -> WriteResult<()> {
|
||||
if self.writeable {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(NonWriteableError)
|
||||
}
|
||||
fn assert_writeable(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
}
|
||||
|
||||
fn get_seg_size(&self, lsn: Lsn) -> u32 {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
let slice = self.segsizes.slice_range(..=lsn);
|
||||
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
// We make sure there is always at least one entry
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
*entry
|
||||
} else {
|
||||
0
|
||||
panic!("could not find seg size in in-memory layer");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -98,30 +96,23 @@ impl Layer for InMemoryLayer {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn;
|
||||
let dropped;
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
if let Some(drop_lsn) = inner.end_lsn {
|
||||
end_lsn = drop_lsn;
|
||||
dropped = true;
|
||||
} else {
|
||||
end_lsn = Lsn(u64::MAX);
|
||||
dropped = false;
|
||||
}
|
||||
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
dropped: inner.dropped,
|
||||
}
|
||||
.to_string();
|
||||
|
||||
PathBuf::from(format!("inmem-{}", delta_filename))
|
||||
}
|
||||
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
@@ -135,14 +126,10 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
if let Some(end_lsn) = self.end_lsn {
|
||||
return Lsn(end_lsn.0 + 1);
|
||||
}
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
drop_lsn
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
end_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
}
|
||||
@@ -150,7 +137,7 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.drop_lsn.is_some()
|
||||
inner.dropped
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
@@ -164,18 +151,15 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
let predecessor: Option<Arc<dyn Layer>>;
|
||||
|
||||
{
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Scan the BTreeMap backwards, starting from reconstruct_data.lsn.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let mut iter = inner
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let iter = inner
|
||||
.page_versions
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
|
||||
.iter_block_lsn_range(blknum, ..=lsn)
|
||||
.rev();
|
||||
for (_entry_lsn, entry) in iter {
|
||||
if let Some(img) = &entry.page_image {
|
||||
reconstruct_data.page_img = Some(img.clone());
|
||||
need_image = false;
|
||||
@@ -192,16 +176,14 @@ impl Layer for InMemoryLayer {
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
|
||||
predecessor = inner.predecessor.clone();
|
||||
// release lock on 'inner'
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know about the predecessor layer.
|
||||
// caller know
|
||||
if need_image {
|
||||
if let Some(cont_layer) = predecessor {
|
||||
Ok(PageReconstructResult::Continue(self.start_lsn, cont_layer))
|
||||
if self.incremental {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
@@ -213,6 +195,10 @@ impl Layer for InMemoryLayer {
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
Ok(inner.get_seg_size(lsn))
|
||||
@@ -228,8 +214,8 @@ impl Layer for InMemoryLayer {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
if lsn >= drop_lsn {
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
@@ -252,8 +238,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.predecessor.is_some()
|
||||
self.incremental
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
@@ -261,27 +246,27 @@ impl Layer for InMemoryLayer {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
.drop_lsn
|
||||
.end_lsn
|
||||
.as_ref()
|
||||
.map(|drop_lsn| drop_lsn.to_string())
|
||||
.map(Lsn::to_string)
|
||||
.unwrap_or_default();
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} seg {} {}-{} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str
|
||||
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
|
||||
);
|
||||
|
||||
for (k, v) in inner.segsizes.iter() {
|
||||
for (k, v) in inner.segsizes.as_slice() {
|
||||
println!("segsizes {}: {}", k, v);
|
||||
}
|
||||
|
||||
for (k, v) in inner.page_versions.iter() {
|
||||
for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
println!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
k.0,
|
||||
k.1,
|
||||
v.page_image.is_some(),
|
||||
v.record.is_some()
|
||||
blknum,
|
||||
lsn,
|
||||
pv.page_image.is_some(),
|
||||
pv.record.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -289,26 +274,19 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Write failed because the layer is in process of being replaced.
|
||||
/// See [`LayeredTimeline::perform_write_op`] for how to handle this error.
|
||||
#[derive(Debug)]
|
||||
pub struct NonWriteableError;
|
||||
/// A result of an inmemory layer data being written to disk.
|
||||
pub struct LayersOnDisk {
|
||||
pub delta_layers: Vec<DeltaLayer>,
|
||||
pub image_layers: Vec<ImageLayer>,
|
||||
}
|
||||
|
||||
pub type WriteResult<T> = std::result::Result<T, NonWriteableError>;
|
||||
|
||||
/// Helper struct to cleanup `InMemoryLayer::freeze` return signature.
|
||||
pub struct FreezeLayers {
|
||||
/// Replacement layer for the layer which freeze was called on.
|
||||
pub frozen: Arc<InMemoryLayer>,
|
||||
/// New open layer containing leftover data.
|
||||
pub open: Option<Arc<InMemoryLayer>>,
|
||||
impl LayersOnDisk {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.delta_layers.is_empty() && self.image_layers.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
fn assert_not_frozen(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
}
|
||||
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
pub fn get_oldest_pending_lsn(&self) -> Lsn {
|
||||
self.oldest_pending_lsn
|
||||
@@ -332,20 +310,25 @@ impl InMemoryLayer {
|
||||
start_lsn
|
||||
);
|
||||
|
||||
// The segment is initially empty, so initialize 'segsizes' with 0.
|
||||
let mut segsizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
segsizes.append(start_lsn, 0).unwrap();
|
||||
}
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn: None,
|
||||
oldest_pending_lsn,
|
||||
incremental: false,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
segsizes: BTreeMap::new(),
|
||||
writeable: true,
|
||||
predecessor: None,
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -353,7 +336,7 @@ impl InMemoryLayer {
|
||||
// Write operations
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> WriteResult<u32> {
|
||||
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> u32 {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
rec.lsn,
|
||||
@@ -365,7 +348,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> WriteResult<u32> {
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
@@ -378,8 +361,7 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> WriteResult<u32> {
|
||||
self.assert_not_frozen();
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 {
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
trace!(
|
||||
@@ -391,9 +373,9 @@ impl InMemoryLayer {
|
||||
);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.check_writeable()?;
|
||||
inner.assert_writeable();
|
||||
|
||||
let old = inner.page_versions.insert((blknum, lsn), pv);
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -438,7 +420,9 @@ impl InMemoryLayer {
|
||||
gapblknum,
|
||||
blknum
|
||||
);
|
||||
let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
|
||||
let old = inner
|
||||
.page_versions
|
||||
.append_or_update_last(gapblknum, lsn, zeropv);
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
|
||||
if old.is_some() {
|
||||
@@ -449,49 +433,47 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
inner.segsizes.insert(lsn, newsize);
|
||||
return Ok(newsize - oldsize);
|
||||
inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
|
||||
return newsize - oldsize;
|
||||
}
|
||||
}
|
||||
Ok(0)
|
||||
|
||||
0
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> {
|
||||
self.assert_not_frozen();
|
||||
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) {
|
||||
assert!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"put_truncation() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.check_writeable()?;
|
||||
inner.assert_writeable();
|
||||
|
||||
// check that this we truncate to a smaller size than segment was before the truncation
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
assert!(segsize < oldsize);
|
||||
|
||||
let old = inner.segsizes.insert(lsn, segsize);
|
||||
let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Inserting truncation, but had an entry for the LSN already");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that the segment was dropped at given LSN
|
||||
pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> {
|
||||
self.assert_not_frozen();
|
||||
|
||||
pub fn drop_segment(&self, lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.check_writeable()?;
|
||||
|
||||
assert!(inner.drop_lsn.is_none());
|
||||
inner.drop_lsn = Some(lsn);
|
||||
inner.writeable = false;
|
||||
assert!(inner.end_lsn.is_none());
|
||||
assert!(!inner.dropped);
|
||||
inner.dropped = true;
|
||||
assert!(self.start_lsn < lsn);
|
||||
inner.end_lsn = Some(lsn);
|
||||
|
||||
trace!("dropped segment {} at {}", self.seg, lsn);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -518,11 +500,11 @@ impl InMemoryLayer {
|
||||
start_lsn,
|
||||
);
|
||||
|
||||
// For convenience, copy the segment size from the predecessor layer
|
||||
let mut segsizes = BTreeMap::new();
|
||||
// Copy the segment size at the start LSN from the predecessor layer.
|
||||
let mut segsizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
let size = src.get_seg_size(start_lsn)?;
|
||||
segsizes.insert(start_lsn, size);
|
||||
segsizes.append(start_lsn, size).unwrap();
|
||||
}
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
@@ -531,124 +513,43 @@ impl InMemoryLayer {
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn: None,
|
||||
oldest_pending_lsn,
|
||||
incremental: true,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
writeable: true,
|
||||
predecessor: Some(src),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn is_writeable(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.writeable
|
||||
inner.end_lsn.is_none()
|
||||
}
|
||||
|
||||
/// Splits `self` into two InMemoryLayers: `frozen` and `open`.
|
||||
/// All data up to and including `cutoff_lsn`
|
||||
/// is copied to `frozen`, while the remaining data is copied to `open`.
|
||||
/// After completion, self is non-writeable, but not frozen.
|
||||
pub fn freeze(self: Arc<Self>, cutoff_lsn: Lsn) -> Result<FreezeLayers> {
|
||||
info!(
|
||||
"freezing in memory layer {} on timeline {} at {} (oldest {})",
|
||||
self.filename().display(),
|
||||
self.timelineid,
|
||||
cutoff_lsn,
|
||||
self.oldest_pending_lsn
|
||||
);
|
||||
/// Make the layer non-writeable. Only call once.
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is inclusive
|
||||
pub fn freeze(&self, end_lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
self.assert_not_frozen();
|
||||
|
||||
let self_ref = self.clone();
|
||||
let mut inner = self_ref.inner.write().unwrap();
|
||||
// Dropped layers don't need any special freeze actions,
|
||||
// they are marked as non-writeable at drop and just
|
||||
// written out to disk by checkpointer.
|
||||
if inner.drop_lsn.is_some() {
|
||||
assert!(!inner.writeable);
|
||||
info!(
|
||||
"freezing in memory layer for {} on timeline {} is dropped at {}",
|
||||
self.seg,
|
||||
self.timelineid,
|
||||
inner.drop_lsn.unwrap()
|
||||
);
|
||||
|
||||
// There should be no newer layer that refers this non-writeable layer,
|
||||
// because layer that is created after dropped one represents a new rel.
|
||||
return Ok(FreezeLayers {
|
||||
frozen: self,
|
||||
open: None,
|
||||
});
|
||||
}
|
||||
assert!(inner.writeable);
|
||||
inner.writeable = false;
|
||||
|
||||
// Divide all the page versions into old and new
|
||||
// at the 'cutoff_lsn' point.
|
||||
let mut before_segsizes = BTreeMap::new();
|
||||
let mut after_segsizes = BTreeMap::new();
|
||||
let mut after_oldest_lsn: Accum<Lsn> = Accum(None);
|
||||
for (lsn, size) in inner.segsizes.iter() {
|
||||
if *lsn > cutoff_lsn {
|
||||
after_segsizes.insert(*lsn, *size);
|
||||
after_oldest_lsn.accum(min, *lsn);
|
||||
} else {
|
||||
before_segsizes.insert(*lsn, *size);
|
||||
}
|
||||
}
|
||||
|
||||
let mut before_page_versions = BTreeMap::new();
|
||||
let mut after_page_versions = BTreeMap::new();
|
||||
for ((blknum, lsn), pv) in inner.page_versions.iter() {
|
||||
if *lsn > cutoff_lsn {
|
||||
after_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
after_oldest_lsn.accum(min, *lsn);
|
||||
} else {
|
||||
before_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let frozen = Arc::new(InMemoryLayer {
|
||||
conf: self.conf,
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: Some(cutoff_lsn),
|
||||
oldest_pending_lsn: self.start_lsn,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
drop_lsn: inner.drop_lsn,
|
||||
page_versions: before_page_versions,
|
||||
segsizes: before_segsizes,
|
||||
writeable: false,
|
||||
predecessor: inner.predecessor.clone(),
|
||||
}),
|
||||
});
|
||||
|
||||
let open = if !after_segsizes.is_empty() || !after_page_versions.is_empty() {
|
||||
let mut new_open = Self::create_successor_layer(
|
||||
self.conf,
|
||||
frozen.clone(),
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
cutoff_lsn + 1,
|
||||
after_oldest_lsn.0.unwrap(),
|
||||
)?;
|
||||
|
||||
let new_inner = new_open.inner.get_mut().unwrap();
|
||||
new_inner.page_versions.append(&mut after_page_versions);
|
||||
new_inner.segsizes.append(&mut after_segsizes);
|
||||
|
||||
Some(Arc::new(new_open))
|
||||
if inner.end_lsn.is_some() {
|
||||
assert!(inner.dropped);
|
||||
} else {
|
||||
None
|
||||
};
|
||||
assert!(!inner.dropped);
|
||||
assert!(self.start_lsn < end_lsn + 1);
|
||||
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
|
||||
|
||||
Ok(FreezeLayers { frozen, open })
|
||||
if let Some((lsn, _)) = inner.segsizes.as_slice().last() {
|
||||
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
|
||||
}
|
||||
|
||||
for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
assert!(lsn <= end_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the this frozen in-memory layer to disk.
|
||||
@@ -659,16 +560,15 @@ impl InMemoryLayer {
|
||||
/// WAL records between start and end LSN. (The delta layer is not needed
|
||||
/// when a new relish is created with a single LSN, so that the start and
|
||||
/// end LSN are the same.)
|
||||
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
|
||||
trace!(
|
||||
"write_to_disk {} end_lsn is {} get_end_lsn is {}",
|
||||
"write_to_disk {} get_end_lsn is {}",
|
||||
self.filename().display(),
|
||||
self.end_lsn.unwrap_or(Lsn(0)),
|
||||
self.get_end_lsn()
|
||||
);
|
||||
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to aquire the
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
// though: another thread might have grabbed a reference to this layer
|
||||
// in `get_layer_for_write' just before the checkpointer called
|
||||
@@ -677,49 +577,45 @@ impl InMemoryLayer {
|
||||
// would have to wait until we release it. That race condition is very
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
assert!(!inner.writeable);
|
||||
let end_lsn_exclusive = inner.end_lsn.unwrap();
|
||||
|
||||
let predecessor = inner.predecessor.clone();
|
||||
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
if inner.dropped {
|
||||
let delta_layer = DeltaLayer::create(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
drop_lsn,
|
||||
end_lsn_exclusive,
|
||||
true,
|
||||
predecessor,
|
||||
inner.page_versions.iter(),
|
||||
inner.page_versions.ordered_page_version_iter(None),
|
||||
inner.segsizes.clone(),
|
||||
)?;
|
||||
trace!(
|
||||
"freeze: created delta layer for dropped segment {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
drop_lsn
|
||||
end_lsn_exclusive
|
||||
);
|
||||
return Ok(vec![Arc::new(delta_layer)]);
|
||||
return Ok(LayersOnDisk {
|
||||
delta_layers: vec![delta_layer],
|
||||
image_layers: Vec::new(),
|
||||
});
|
||||
}
|
||||
|
||||
let end_lsn = self.end_lsn.unwrap();
|
||||
// Since `end_lsn` is inclusive, subtract 1.
|
||||
// We want to make an ImageLayer for the last included LSN,
|
||||
// so the DeltaLayer should exlcude that LSN.
|
||||
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
|
||||
|
||||
let mut before_segsizes = BTreeMap::new();
|
||||
for (lsn, size) in inner.segsizes.iter() {
|
||||
if *lsn <= end_lsn {
|
||||
before_segsizes.insert(*lsn, *size);
|
||||
}
|
||||
}
|
||||
let mut before_page_versions = inner.page_versions.iter().filter(|tup| {
|
||||
let ((_blknum, lsn), _pv) = tup;
|
||||
let mut page_versions = inner
|
||||
.page_versions
|
||||
.ordered_page_version_iter(Some(end_lsn_inclusive));
|
||||
|
||||
*lsn < end_lsn
|
||||
});
|
||||
let mut delta_layers = Vec::new();
|
||||
|
||||
let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();
|
||||
|
||||
if self.start_lsn != end_lsn {
|
||||
if self.start_lsn != end_lsn_inclusive {
|
||||
let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive);
|
||||
// Write the page versions before the cutoff to disk.
|
||||
let delta_layer = DeltaLayer::create(
|
||||
self.conf,
|
||||
@@ -727,35 +623,36 @@ impl InMemoryLayer {
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn,
|
||||
end_lsn_inclusive,
|
||||
false,
|
||||
predecessor,
|
||||
before_page_versions,
|
||||
before_segsizes,
|
||||
page_versions,
|
||||
segsizes,
|
||||
)?;
|
||||
frozen_layers.push(Arc::new(delta_layer));
|
||||
delta_layers.push(delta_layer);
|
||||
trace!(
|
||||
"freeze: created delta layer {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn
|
||||
end_lsn_inclusive
|
||||
);
|
||||
} else {
|
||||
assert!(before_page_versions.next().is_none());
|
||||
assert!(page_versions.next().is_none());
|
||||
}
|
||||
|
||||
drop(inner);
|
||||
|
||||
// Write a new base image layer at the cutoff point
|
||||
let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
|
||||
frozen_layers.push(Arc::new(image_layer));
|
||||
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
|
||||
let image_layer =
|
||||
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
|
||||
trace!(
|
||||
"freeze: created image layer {} at {}",
|
||||
self.seg,
|
||||
end_lsn_inclusive
|
||||
);
|
||||
|
||||
Ok(frozen_layers)
|
||||
}
|
||||
|
||||
pub fn update_predecessor(&self, predecessor: Arc<dyn Layer>) -> Option<Arc<dyn Layer>> {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.predecessor.replace(predecessor)
|
||||
Ok(LayersOnDisk {
|
||||
delta_layers,
|
||||
image_layers: vec![image_layer],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
230
pageserver/src/layered_repository/page_versions.rs
Normal file
230
pageserver/src/layered_repository/page_versions.rs
Normal file
@@ -0,0 +1,230 @@
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
ops::{Range, RangeBounds},
|
||||
slice,
|
||||
};
|
||||
|
||||
use zenith_utils::{bin_ser::LeSer, lsn::Lsn, vec_map::VecMap};
|
||||
|
||||
use super::storage_layer::PageVersion;
|
||||
|
||||
const EMPTY_SLICE: &[(Lsn, Range<usize>)] = &[];
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct PageVersions {
|
||||
heap: Vec<u8>,
|
||||
ranges: HashMap<u32, VecMap<Lsn, Range<usize>>>,
|
||||
}
|
||||
|
||||
impl PageVersions {
|
||||
pub fn append_or_update_last(
|
||||
&mut self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
page_version: PageVersion,
|
||||
) -> Option<PageVersion> {
|
||||
let mut new_bytes = PageVersion::ser(&page_version).unwrap();
|
||||
|
||||
let map = self.ranges.entry(blknum).or_insert_with(VecMap::default);
|
||||
|
||||
if let Some((last_lsn, last_range)) = map.as_slice().last() {
|
||||
if lsn == *last_lsn {
|
||||
let old_bytes = &self.heap[last_range.clone()];
|
||||
if old_bytes == new_bytes {
|
||||
return Some(page_version);
|
||||
}
|
||||
// TODO optimize for case when old_bytes.len() >= new_bytes.len()
|
||||
}
|
||||
}
|
||||
|
||||
let new_range = self.heap.len()..self.heap.len() + new_bytes.len();
|
||||
self.heap.append(&mut new_bytes);
|
||||
map.append_or_update_last(lsn, new_range)
|
||||
.unwrap()
|
||||
.map(|old_range| {
|
||||
let old_bytes = &self.heap[old_range];
|
||||
PageVersion::des(old_bytes).unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
/// Get all [`PageVersion`]s in a block
|
||||
pub fn iter_block(&self, blknum: u32) -> BlockVersionIter<'_> {
|
||||
let range_iter = self
|
||||
.ranges
|
||||
.get(&blknum)
|
||||
.map(VecMap::as_slice)
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
.iter();
|
||||
|
||||
BlockVersionIter {
|
||||
heap: &self.heap,
|
||||
range_iter,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a range of [`PageVersions`] in a block
|
||||
pub fn iter_block_lsn_range<R: RangeBounds<Lsn>>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
range: R,
|
||||
) -> BlockVersionIter<'_> {
|
||||
let range_iter = self
|
||||
.ranges
|
||||
.get(&blknum)
|
||||
.map(|vec_map| vec_map.slice_range(range))
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
.iter();
|
||||
|
||||
BlockVersionIter {
|
||||
heap: &self.heap,
|
||||
range_iter,
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterate through [`PageVersion`]s in (block, lsn) order.
|
||||
/// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
|
||||
pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
|
||||
let mut ordered_blocks: Vec<u32> = self.ranges.keys().cloned().collect();
|
||||
ordered_blocks.sort_unstable();
|
||||
|
||||
let cur_block_iter = ordered_blocks
|
||||
.first()
|
||||
.map(|&blknum| self.iter_block(blknum))
|
||||
.unwrap_or_else(|| {
|
||||
let empty_iter = EMPTY_SLICE.iter();
|
||||
BlockVersionIter {
|
||||
heap: &self.heap,
|
||||
range_iter: empty_iter,
|
||||
}
|
||||
});
|
||||
|
||||
OrderedPageVersionIter {
|
||||
page_versions: self,
|
||||
ordered_blocks,
|
||||
cur_block_idx: 0,
|
||||
cutoff_lsn,
|
||||
cur_block_iter,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BlockVersionIter<'a> {
|
||||
heap: &'a Vec<u8>,
|
||||
range_iter: slice::Iter<'a, (Lsn, Range<usize>)>,
|
||||
}
|
||||
|
||||
impl BlockVersionIter<'_> {
|
||||
fn get_iter_result(&self, tuple: Option<&(Lsn, Range<usize>)>) -> Option<(Lsn, PageVersion)> {
|
||||
let (lsn, range) = tuple?;
|
||||
let range = range.clone();
|
||||
|
||||
let pv_bytes = &self.heap[range];
|
||||
let page_version = PageVersion::des(pv_bytes).unwrap();
|
||||
|
||||
Some((*lsn, page_version))
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for BlockVersionIter<'_> {
|
||||
type Item = (Lsn, PageVersion);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let tuple = self.range_iter.next();
|
||||
self.get_iter_result(tuple)
|
||||
}
|
||||
}
|
||||
|
||||
impl DoubleEndedIterator for BlockVersionIter<'_> {
|
||||
fn next_back(&mut self) -> Option<Self::Item> {
|
||||
let tuple = self.range_iter.next_back();
|
||||
self.get_iter_result(tuple)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OrderedPageVersionIter<'a> {
|
||||
page_versions: &'a PageVersions,
|
||||
|
||||
ordered_blocks: Vec<u32>,
|
||||
cur_block_idx: usize,
|
||||
|
||||
cutoff_lsn: Option<Lsn>,
|
||||
|
||||
cur_block_iter: BlockVersionIter<'a>,
|
||||
}
|
||||
|
||||
impl OrderedPageVersionIter<'_> {
|
||||
fn is_lsn_before_cutoff(&self, lsn: Lsn) -> bool {
|
||||
if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
|
||||
lsn < *cutoff_lsn
|
||||
} else {
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for OrderedPageVersionIter<'_> {
|
||||
type Item = (u32, Lsn, PageVersion);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some((lsn, page_version)) = self.cur_block_iter.next() {
|
||||
if self.is_lsn_before_cutoff(lsn) {
|
||||
let blknum = self.ordered_blocks[self.cur_block_idx];
|
||||
return Some((blknum, lsn, page_version));
|
||||
}
|
||||
}
|
||||
|
||||
let next_block_idx = self.cur_block_idx + 1;
|
||||
let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
|
||||
self.cur_block_idx = next_block_idx;
|
||||
self.cur_block_iter = self.page_versions.iter_block(blknum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
const EMPTY_PAGE_VERSION: PageVersion = PageVersion {
|
||||
page_image: None,
|
||||
record: None,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_ordered_iter() {
|
||||
let mut page_versions = PageVersions::default();
|
||||
const BLOCKS: u32 = 1000;
|
||||
const LSNS: u64 = 50;
|
||||
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..LSNS {
|
||||
let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION);
|
||||
assert!(old.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
let mut iter = page_versions.ordered_page_version_iter(None);
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..LSNS {
|
||||
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
|
||||
assert_eq!(actual_blknum, blknum);
|
||||
assert_eq!(Lsn(lsn), actual_lsn);
|
||||
}
|
||||
}
|
||||
assert!(iter.next().is_none());
|
||||
assert!(iter.next().is_none()); // should be robust against excessive next() calls
|
||||
|
||||
const CUTOFF_LSN: Lsn = Lsn(30);
|
||||
let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..CUTOFF_LSN.0 {
|
||||
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
|
||||
assert_eq!(actual_blknum, blknum);
|
||||
assert_eq!(Lsn(lsn), actual_lsn);
|
||||
}
|
||||
}
|
||||
assert!(iter.next().is_none());
|
||||
assert!(iter.next().is_none()); // should be robust against excessive next() calls
|
||||
}
|
||||
}
|
||||
@@ -10,7 +10,6 @@ use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
@@ -87,9 +86,9 @@ pub struct PageReconstructData {
|
||||
pub enum PageReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should collect
|
||||
/// more data from the returned predecessor layer at the returned LSN.
|
||||
Continue(Lsn, Arc<dyn Layer>),
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue(Lsn),
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
@@ -124,10 +123,6 @@ pub trait Layer: Send + Sync {
|
||||
/// Is the segment represented by this layer dropped by PostgreSQL?
|
||||
fn is_dropped(&self) -> bool;
|
||||
|
||||
/// Gets the physical location of the layer on disk.
|
||||
/// Some layers, such as in-memory, might not have the location.
|
||||
fn path(&self) -> Option<PathBuf>;
|
||||
|
||||
/// Filename used to store this layer on disk. (Even in-memory layers
|
||||
/// implement this, to print a handy unique identifier for the layer for
|
||||
/// log messages, even though they're never not on disk.)
|
||||
@@ -145,8 +140,8 @@ pub trait Layer: Send + Sync {
|
||||
///
|
||||
/// See PageReconstructResult for possible return values. The collected data
|
||||
/// is appended to reconstruct_data; the caller should pass an empty struct
|
||||
/// on first call. If this returns PageReconstructResult::Continue, call
|
||||
/// again on the returned predecessor layer with the same 'reconstruct_data'
|
||||
/// on first call. If this returns PageReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data'
|
||||
/// to collect more data.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
|
||||
@@ -126,10 +126,6 @@ impl PageServerConf {
|
||||
self.timeline_path(timelineid, tenantid).join("ancestor")
|
||||
}
|
||||
|
||||
fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timeline_path(timelineid, tenantid).join("wal")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::net::TcpListener;
|
||||
use std::str;
|
||||
@@ -21,10 +20,12 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::{io, net::TcpStream};
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::auth::{self, JwtAuth};
|
||||
use zenith_utils::auth::{Claims, Scope};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::is_socket_read_timed_out;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::postgres_backend::{self, AuthType};
|
||||
use zenith_utils::pq_proto::{
|
||||
@@ -187,17 +188,32 @@ pub fn thread_main(
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let mut join_handles = Vec::new();
|
||||
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
let (socket, peer_addr) = listener.accept()?;
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
let local_auth = auth.clone();
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
|
||||
error!("page server thread exiting with error: {:#}", err);
|
||||
}
|
||||
});
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name("serving Page Service thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
|
||||
error!(%err, "page server thread exited with error");
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
join_handles.push(handle);
|
||||
}
|
||||
|
||||
debug!("page_service loop terminated. wait for connections to cancel");
|
||||
for handle in join_handles.into_iter() {
|
||||
handle.join().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn page_service_conn_main(
|
||||
@@ -216,7 +232,7 @@ fn page_service_conn_main(
|
||||
}
|
||||
|
||||
let mut conn_handler = PageServerHandler::new(conf, auth);
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
@@ -260,50 +276,66 @@ impl PageServerHandler {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
|
||||
while let Some(message) = pgb.read_message()? {
|
||||
trace!("query({:?}): {:?}", timelineid, message);
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
match pgb.read_message() {
|
||||
Ok(message) => {
|
||||
if let Some(message) = message {
|
||||
trace!("query: {:?}", message);
|
||||
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
_ => continue,
|
||||
};
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
|
||||
let response = match zenith_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_rel_exists_request(&*timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size"])
|
||||
.observe_closure_duration(|| self.handle_get_nblocks_request(&*timeline, &req)),
|
||||
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(&*timeline, &req)
|
||||
}),
|
||||
};
|
||||
let response = match zenith_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_rel_exists_request(&*timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_nblocks_request(&*timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(&*timeline, &req)
|
||||
}),
|
||||
};
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough
|
||||
error!("error reading relation or page version: {:#}", e);
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough
|
||||
error!("error reading relation or page version: {:#}", e);
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
|
||||
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
|
||||
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if !is_socket_read_timed_out(&e) {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -363,6 +395,8 @@ impl PageServerHandler {
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamExistsRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
@@ -378,6 +412,7 @@ impl PageServerHandler {
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamNblocksRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
@@ -397,6 +432,8 @@ impl PageServerHandler {
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
|
||||
.entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
@@ -414,17 +451,20 @@ impl PageServerHandler {
|
||||
lsn: Option<Lsn>,
|
||||
tenantid: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
|
||||
let _enter = span.enter();
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
/* switch client to COPYOUT */
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message(&BeMessage::CopyOutResponse)?;
|
||||
info!("sent CopyOut");
|
||||
|
||||
/* Send a tarball of the latest layer on the timeline */
|
||||
{
|
||||
let mut writer = CopyDataSink { pgb };
|
||||
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
|
||||
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
||||
basebackup.send_tarball()?;
|
||||
}
|
||||
pgb.write_message(&BeMessage::CopyDone)?;
|
||||
@@ -529,11 +569,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
None
|
||||
};
|
||||
|
||||
info!(
|
||||
"got basebackup command. tenantid=\"{}\" timelineid=\"{}\" lsn=\"{:#?}\"",
|
||||
tenantid, timelineid, lsn
|
||||
);
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
@@ -551,6 +586,9 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
self.check_permission(Some(tenantid))?;
|
||||
|
||||
let _enter =
|
||||
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
@@ -573,6 +611,9 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
self.check_permission(Some(tenantid))?;
|
||||
|
||||
let _enter =
|
||||
info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();
|
||||
|
||||
let branch =
|
||||
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
|
||||
let branch = serde_json::to_vec(&branch)?;
|
||||
|
||||
@@ -12,14 +12,12 @@ mod rust_s3;
|
||||
/// local page server layer files with external storage.
|
||||
mod synced_storage;
|
||||
|
||||
use std::path::Path;
|
||||
use std::thread;
|
||||
use std::{path::Path, thread};
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use self::local_fs::LocalFs;
|
||||
pub use self::synced_storage::schedule_timeline_upload;
|
||||
use crate::relish_storage::rust_s3::RustS3;
|
||||
use self::{local_fs::LocalFs, rust_s3::RustS3};
|
||||
use crate::{PageServerConf, RelishStorageKind};
|
||||
|
||||
pub fn run_storage_sync_thread(
|
||||
|
||||
@@ -5,9 +5,10 @@ use std::path::Path;
|
||||
use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
|
||||
use crate::{relish_storage::strip_workspace_prefix, S3Config};
|
||||
|
||||
use super::RelishStorage;
|
||||
use crate::{
|
||||
relish_storage::{strip_workspace_prefix, RelishStorage},
|
||||
S3Config,
|
||||
};
|
||||
|
||||
const S3_FILE_SEPARATOR: char = '/';
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::time::Duration;
|
||||
use std::{collections::BinaryHeap, sync::Mutex, thread};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::{relish_storage::RelishStorage, PageServerConf};
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
@@ -31,22 +32,26 @@ pub fn run_storage_sync_thread<
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name("Queue based relish storage sync".to_string())
|
||||
.spawn(move || loop {
|
||||
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
|
||||
log::debug!("Upload queue length: {}", queue_accessor.len());
|
||||
let next_task = queue_accessor.pop();
|
||||
drop(queue_accessor);
|
||||
match next_task {
|
||||
Some(task) => runtime.block_on(async {
|
||||
// suppress warnings
|
||||
let _ = (config, task, &relish_storage, max_concurrent_sync);
|
||||
todo!("omitted for brevity")
|
||||
}),
|
||||
None => {
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
continue;
|
||||
.spawn(move || {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
|
||||
log::debug!("Upload queue length: {}", queue_accessor.len());
|
||||
let next_task = queue_accessor.pop();
|
||||
drop(queue_accessor);
|
||||
match next_task {
|
||||
Some(task) => runtime.block_on(async {
|
||||
// suppress warnings
|
||||
let _ = (config, task, &relish_storage, max_concurrent_sync);
|
||||
todo!("omitted for brevity")
|
||||
}),
|
||||
None => {
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
log::debug!("Queue based relish storage sync thread shut down");
|
||||
Ok(())
|
||||
})?;
|
||||
Ok(Some(handle))
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::ops::AddAssign;
|
||||
use std::ops::{AddAssign, Deref};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::lsn::{Lsn, RecordLsn};
|
||||
@@ -13,6 +13,8 @@ use zenith_utils::zid::ZTimelineId;
|
||||
/// A repository corresponds to one .zenith directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
pub trait Repository: Send + Sync {
|
||||
fn shutdown(&self) -> Result<()>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
@@ -123,6 +125,39 @@ pub trait Timeline: Send + Sync {
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Atomically get both last and prev.
|
||||
fn get_last_record_rlsn(&self) -> RecordLsn;
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors,
|
||||
/// doesnt support TwoPhase relishes yet
|
||||
fn get_current_logical_size(&self) -> usize;
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used in tests to ensure thet incremental and non incremental variants match.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
|
||||
}
|
||||
|
||||
/// Various functions to mutate the timeline.
|
||||
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
|
||||
// This is probably considered a bad practice in Rust and should be fixed eventually,
|
||||
// but will cause large code changes.
|
||||
pub trait TimelineWriter: Deref<Target = dyn Timeline> {
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
@@ -143,29 +178,6 @@ pub trait Timeline: Send + Sync {
|
||||
/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
|
||||
/// Previous last record LSN is stored alongside the latest and can be read.
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
/// Atomically get both last and prev.
|
||||
fn get_last_record_rlsn(&self) -> RecordLsn;
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors,
|
||||
/// doesnt support TwoPhase relishes yet
|
||||
fn get_current_logical_size(&self) -> usize;
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used in tests to ensure thet incremental and non incremental variants match.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
@@ -209,7 +221,7 @@ impl WALRecord {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME};
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use hex_literal::hex;
|
||||
@@ -306,14 +318,15 @@ mod tests {
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
tline.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
|
||||
tline.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
|
||||
writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
|
||||
|
||||
tline.advance_last_record_lsn(Lsn(0x50));
|
||||
writer.advance_last_record_lsn(Lsn(0x50));
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(0x50));
|
||||
|
||||
@@ -359,8 +372,8 @@ mod tests {
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x60));
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x60));
|
||||
assert_current_logical_size(&tline, Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
@@ -382,13 +395,13 @@ mod tests {
|
||||
);
|
||||
|
||||
// Truncate to zero length
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x68));
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x68));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0);
|
||||
|
||||
// Extend from 0 to 2 blocks, leaving a gap
|
||||
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x70));
|
||||
writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x70));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
|
||||
assert_eq!(
|
||||
@@ -423,25 +436,26 @@ mod tests {
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x20));
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x20));
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1);
|
||||
|
||||
// Drop relish
|
||||
tline.drop_relish(TESTREL_A, Lsn(0x30))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x30));
|
||||
writer.drop_relish(TESTREL_A, Lsn(0x30))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x30));
|
||||
|
||||
// Check that rel is not visible anymore
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
|
||||
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none());
|
||||
|
||||
// Extend it again
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x40));
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
|
||||
@@ -459,6 +473,7 @@ mod tests {
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
//from storage_layer.rs
|
||||
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
|
||||
@@ -468,10 +483,10 @@ mod tests {
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
}
|
||||
|
||||
tline.advance_last_record_lsn(Lsn(0x20));
|
||||
writer.advance_last_record_lsn(Lsn(0x20));
|
||||
|
||||
// The relation was created at LSN 2, not visible at LSN 1 yet.
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
|
||||
@@ -495,8 +510,8 @@ mod tests {
|
||||
|
||||
// Truncate relation so that second segment was dropped
|
||||
// - only leave one page
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x60));
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1);
|
||||
@@ -529,9 +544,9 @@ mod tests {
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x80);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
}
|
||||
tline.advance_last_record_lsn(Lsn(0x80));
|
||||
writer.advance_last_record_lsn(Lsn(0x80));
|
||||
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
|
||||
assert_eq!(
|
||||
@@ -557,14 +572,15 @@ mod tests {
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_large_rel")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
let mut lsn = 0x10;
|
||||
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
lsn += 0x10;
|
||||
tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
|
||||
writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
|
||||
}
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
|
||||
@@ -575,8 +591,8 @@ mod tests {
|
||||
|
||||
// Truncate one block
|
||||
lsn += 0x10;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
pg_constants::RELSEG_SIZE
|
||||
@@ -585,8 +601,8 @@ mod tests {
|
||||
|
||||
// Truncate another block
|
||||
lsn += 0x10;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
pg_constants::RELSEG_SIZE - 1
|
||||
@@ -598,8 +614,8 @@ mod tests {
|
||||
let mut size: i32 = 3000;
|
||||
while size >= 0 {
|
||||
lsn += 0x10;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
size as u32
|
||||
@@ -619,16 +635,17 @@ mod tests {
|
||||
fn test_list_rels_drop() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_list_rels_drop")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
const TESTDB: u32 = 111;
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
|
||||
tline.advance_last_record_lsn(Lsn(0x30));
|
||||
writer.advance_last_record_lsn(Lsn(0x30));
|
||||
|
||||
// Check that list_rels() lists it after LSN 2, but no before it
|
||||
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A));
|
||||
@@ -638,14 +655,17 @@ mod tests {
|
||||
// Create a branch, check that the relation is visible there
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let new_writer = newtline.writer();
|
||||
|
||||
assert!(newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x30))?
|
||||
.contains(&TESTREL_A));
|
||||
|
||||
// Drop it on the branch
|
||||
newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
|
||||
newtline.advance_last_record_lsn(Lsn(0x40));
|
||||
new_writer.drop_relish(TESTREL_A, Lsn(0x40))?;
|
||||
new_writer.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
drop(new_writer);
|
||||
|
||||
// Check that it's no longer listed on the branch after the point where it was dropped
|
||||
assert!(newtline
|
||||
@@ -673,28 +693,30 @@ mod tests {
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_branch")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
|
||||
// Create another relation
|
||||
tline.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
|
||||
|
||||
tline.advance_last_record_lsn(Lsn(0x40));
|
||||
writer.advance_last_record_lsn(Lsn(0x40));
|
||||
assert_current_logical_size(&tline, Lsn(0x40));
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let new_writer = newtline.writer();
|
||||
|
||||
newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
newtline.advance_last_record_lsn(Lsn(0x40));
|
||||
new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
new_writer.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
// Check page contents on both branches
|
||||
assert_eq!(
|
||||
@@ -728,7 +750,7 @@ mod tests {
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
drop(repo);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join("metadata");
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
|
||||
assert!(metadata_path.is_file());
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! zenith Timeline.
|
||||
//!
|
||||
use log::*;
|
||||
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
|
||||
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
|
||||
use std::cmp::min;
|
||||
@@ -13,6 +12,7 @@ use std::path::Path;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use tracing::*;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
@@ -34,7 +34,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
///
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
writer: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
// Scan 'global'
|
||||
@@ -44,10 +44,10 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
None => continue,
|
||||
|
||||
Some("pg_control") => {
|
||||
import_control_file(timeline, lsn, &direntry.path())?;
|
||||
import_control_file(writer, lsn, &direntry.path())?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
writer,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
@@ -59,7 +59,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
writer,
|
||||
lsn,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
@@ -86,7 +86,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
writer,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
@@ -98,7 +98,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
writer,
|
||||
lsn,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
@@ -108,24 +108,24 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?;
|
||||
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
|
||||
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -133,12 +133,13 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_relfile(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
) -> Result<()> {
|
||||
// Does it look like a relation file?
|
||||
trace!("importing rel file {}", path.display());
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
if let Err(e) = p {
|
||||
@@ -166,14 +167,14 @@ fn import_relfile(
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
Err(err) => match err.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
bail!("error reading file {}: {:#}", path.display(), e);
|
||||
bail!("error reading file {}: {:#}", path.display(), err);
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -190,7 +191,7 @@ fn import_relfile(
|
||||
/// are just slurped into the repository as one blob.
|
||||
///
|
||||
fn import_nonrel_file(
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
tag: RelishTag,
|
||||
path: &Path,
|
||||
@@ -200,7 +201,7 @@ fn import_nonrel_file(
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
info!("importing non-rel file {}", path.display());
|
||||
trace!("importing non-rel file {}", path.display());
|
||||
|
||||
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
@@ -211,13 +212,13 @@ fn import_nonrel_file(
|
||||
///
|
||||
/// The control file is imported as is, but we also extract the checkpoint record
|
||||
/// from it and store it separated.
|
||||
fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result<()> {
|
||||
fn import_control_file(timeline: &dyn TimelineWriter, lsn: Lsn, path: &Path) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
info!("importing control file {}", path.display());
|
||||
trace!("importing control file {}", path.display());
|
||||
|
||||
// Import it as ControlFile
|
||||
timeline.put_page_image(
|
||||
@@ -238,13 +239,18 @@ fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result
|
||||
///
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> {
|
||||
fn import_slru_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
slru: SlruKind,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
// Does it look like an SLRU file?
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
info!("importing slru file {}", path.display());
|
||||
trace!("importing slru file {}", path.display());
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
@@ -260,14 +266,14 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
Err(err) => match err.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
bail!("error reading file {}: {:#}", path.display(), e);
|
||||
bail!("error reading file {}: {:#}", path.display(), err);
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -285,12 +291,15 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
|
||||
///
|
||||
pub fn save_decoded_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
decoded: &DecodedWALRecord,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
checkpoint.update_next_xid(decoded.xl_xid);
|
||||
if checkpoint.update_next_xid(decoded.xl_xid) {
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
@@ -374,7 +383,7 @@ pub fn save_decoded_record(
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
let xlrec = XlClogTruncate::decode(&mut buf);
|
||||
save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
save_clog_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
@@ -443,10 +452,17 @@ pub fn save_decoded_record(
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
|
||||
save_multixact_create_record(
|
||||
checkpoint,
|
||||
checkpoint_modified,
|
||||
timeline,
|
||||
lsn,
|
||||
&xlrec,
|
||||
decoded,
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
||||
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
save_multixact_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
@@ -455,7 +471,10 @@ pub fn save_decoded_record(
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
checkpoint.nextOid = next_oid;
|
||||
if checkpoint.nextOid != next_oid {
|
||||
checkpoint.nextOid = next_oid;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
@@ -471,6 +490,7 @@ pub fn save_decoded_record(
|
||||
);
|
||||
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
|
||||
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -478,7 +498,11 @@ pub fn save_decoded_record(
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
|
||||
fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
|
||||
fn save_xlog_dbase_create(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
rec: &XlCreateDatabase,
|
||||
) -> Result<()> {
|
||||
let db_id = rec.db_id;
|
||||
let tablespace_id = rec.tablespace_id;
|
||||
let src_db_id = rec.src_db_id;
|
||||
@@ -555,7 +579,11 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
|
||||
///
|
||||
/// This is the same logic as in PostgreSQL's smgr_redo() function.
|
||||
fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
|
||||
fn save_xlog_smgr_truncate(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
rec: &XlSmgrTruncate,
|
||||
) -> Result<()> {
|
||||
let spcnode = rec.rnode.spcnode;
|
||||
let dbnode = rec.rnode.dbnode;
|
||||
let relnode = rec.rnode.relnode;
|
||||
@@ -617,7 +645,7 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records.
|
||||
///
|
||||
fn save_xact_record(
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
parsed: &XlXactParsedRecord,
|
||||
decoded: &DecodedWALRecord,
|
||||
@@ -674,7 +702,8 @@ fn save_xact_record(
|
||||
|
||||
fn save_clog_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlClogTruncate,
|
||||
) -> Result<()> {
|
||||
@@ -692,6 +721,7 @@ fn save_clog_truncate_record(
|
||||
// TODO Figure out if there will be any issues with replica.
|
||||
checkpoint.oldestXid = xlrec.oldest_xid;
|
||||
checkpoint.oldestXidDB = xlrec.oldest_xid_db;
|
||||
*checkpoint_modified = true;
|
||||
|
||||
// TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it
|
||||
|
||||
@@ -734,7 +764,8 @@ fn save_clog_truncate_record(
|
||||
|
||||
fn save_multixact_create_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
decoded: &DecodedWALRecord,
|
||||
@@ -790,9 +821,11 @@ fn save_multixact_create_record(
|
||||
}
|
||||
if xlrec.mid >= checkpoint.nextMulti {
|
||||
checkpoint.nextMulti = xlrec.mid + 1;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
|
||||
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
|
||||
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
|
||||
@@ -802,18 +835,22 @@ fn save_multixact_create_record(
|
||||
}
|
||||
});
|
||||
|
||||
checkpoint.update_next_xid(max_mbr_xid);
|
||||
if checkpoint.update_next_xid(max_mbr_xid) {
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
) -> Result<()> {
|
||||
checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
*checkpoint_modified = true;
|
||||
|
||||
// PerformMembersTruncation
|
||||
let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
|
||||
@@ -847,7 +884,7 @@ fn save_multixact_truncate_record(
|
||||
}
|
||||
|
||||
fn save_relmap_page(
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlRelmapUpdate,
|
||||
decoded: &DecodedWALRecord,
|
||||
|
||||
@@ -8,12 +8,14 @@ use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::info;
|
||||
use log::{debug, info};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::thread::JoinHandle;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
lazy_static! {
|
||||
@@ -24,6 +26,19 @@ lazy_static! {
|
||||
fn access_repository() -> MutexGuard<'static, HashMap<ZTenantId, Arc<dyn Repository>>> {
|
||||
REPOSITORY.lock().unwrap()
|
||||
}
|
||||
struct TenantHandleEntry {
|
||||
checkpointer_handle: Option<JoinHandle<()>>,
|
||||
gc_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
// Logically these handles belong to Repository,
|
||||
// but it's just simpler to store them separately
|
||||
lazy_static! {
|
||||
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = access_repository();
|
||||
@@ -47,8 +62,18 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc<Layered
|
||||
tenant_id,
|
||||
true,
|
||||
));
|
||||
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
LayeredRepository::launch_gc_thread(conf, repo.clone());
|
||||
|
||||
let checkpointer_handle = LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
let gc_handle = LayeredRepository::launch_gc_thread(conf, repo.clone());
|
||||
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
let h = TenantHandleEntry {
|
||||
checkpointer_handle: Some(checkpointer_handle),
|
||||
gc_handle: Some(gc_handle),
|
||||
};
|
||||
|
||||
handles.insert(tenant_id, h);
|
||||
|
||||
repo
|
||||
}
|
||||
|
||||
@@ -82,6 +107,35 @@ fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
|
||||
}
|
||||
}
|
||||
|
||||
// Check this flag in the thread loops to know when to exit
|
||||
pub fn shutdown_requested() -> bool {
|
||||
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn stop_tenant_threads(tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
if let Some(h) = handles.get_mut(&tenantid) {
|
||||
h.checkpointer_handle.take().map(JoinHandle::join);
|
||||
debug!("checkpointer for tenant {} has stopped", tenantid);
|
||||
h.gc_handle.take().map(JoinHandle::join);
|
||||
debug!("gc for tenant {} has stopped", tenantid);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shutdown_all_tenants() -> Result<()> {
|
||||
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
|
||||
|
||||
let tenants = list_tenants()?;
|
||||
for tenantid in tenants {
|
||||
stop_tenant_threads(tenantid);
|
||||
let repo = get_repository_for_tenant(tenantid)?;
|
||||
debug!("shutdown tenant {}", tenantid);
|
||||
repo.shutdown()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn create_repository_for_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
@@ -100,10 +154,6 @@ pub fn create_repository_for_tenant(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
|
||||
access_repository().insert(tenantid, repo);
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
access_repository()
|
||||
.get(&tenantid)
|
||||
@@ -119,3 +169,14 @@ pub fn get_timeline_for_tenant(
|
||||
.get_timeline(timelineid)
|
||||
.with_context(|| format!("cannot fetch timeline {}", timelineid))
|
||||
}
|
||||
|
||||
fn list_tenants() -> Result<Vec<ZTenantId>> {
|
||||
let o = &mut REPOSITORY.lock().unwrap();
|
||||
|
||||
o.iter()
|
||||
.map(|tenant| {
|
||||
let (tenantid, _) = tenant;
|
||||
Ok(*tenantid)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -12,24 +12,22 @@ use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{bail, Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::cell::Cell;
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::thread::JoinHandle;
|
||||
use std::thread_local;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use tracing::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
@@ -39,6 +37,7 @@ use zenith_utils::zid::ZTimelineId;
|
||||
//
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
wal_receiver_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -53,6 +52,19 @@ thread_local! {
|
||||
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
|
||||
}
|
||||
|
||||
// Wait for walreceiver to stop
|
||||
// Now it stops when pageserver shutdown is requested.
|
||||
// In future we can make this more granular and send shutdown signals
|
||||
// per tenant/timeline to cancel inactive walreceivers.
|
||||
// TODO deal with blocking pg connections
|
||||
pub fn stop_wal_receiver(timelineid: ZTimelineId) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
if let Some(r) = receivers.get_mut(&timelineid) {
|
||||
r.wal_receiver_handle.take();
|
||||
// r.wal_receiver_handle.take().map(JoinHandle::join);
|
||||
}
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -67,19 +79,19 @@ pub fn launch_wal_receiver(
|
||||
receiver.wal_producer_connstr = wal_producer_connstr.into();
|
||||
}
|
||||
None => {
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
let wal_receiver_handle = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
IS_WAL_RECEIVER.with(|c| c.set(true));
|
||||
thread_main(conf, timelineid, tenantid);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
wal_receiver_handle: Some(wal_receiver_handle),
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -99,16 +111,14 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
|
||||
// This is the entry point for the WAL receiver thread.
|
||||
//
|
||||
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
|
||||
info!(
|
||||
"WAL receiver thread started for timeline : '{}'",
|
||||
timelineid
|
||||
);
|
||||
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
info!("WAL receiver thread started");
|
||||
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
//
|
||||
loop {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
@@ -122,10 +132,11 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
debug!("WAL streaming shut down");
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
conf: &PageServerConf,
|
||||
_conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
tenantid: ZTenantId,
|
||||
@@ -172,8 +183,8 @@ fn walreceiver_main(
|
||||
startpoint += startpoint.calc_padding(8u32);
|
||||
|
||||
info!(
|
||||
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, timelineid, end_of_wal
|
||||
"last_record_lsn {} starting replication from {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, end_of_wal
|
||||
);
|
||||
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
@@ -195,34 +206,38 @@ fn walreceiver_main(
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
let prev_last_rec_lsn = last_rec_lsn;
|
||||
|
||||
trace!("received XLogData between {} and {}", startlsn, endlsn);
|
||||
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// Save old checkpoint value to compare with it after decoding WAL record
|
||||
let old_checkpoint_bytes = checkpoint.encode();
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
let _enter = info_span!("processing record", lsn = %lsn).entered();
|
||||
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
// at risk of hittind a deadlock.
|
||||
assert!(lsn.is_aligned());
|
||||
|
||||
let writer = timeline.writer();
|
||||
|
||||
let mut checkpoint_modified = false;
|
||||
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
restore_local_repo::save_decoded_record(
|
||||
&mut checkpoint,
|
||||
&*timeline,
|
||||
&mut checkpoint_modified,
|
||||
writer.as_ref(),
|
||||
&decoded,
|
||||
recdata,
|
||||
lsn,
|
||||
)?;
|
||||
|
||||
let new_checkpoint_bytes = checkpoint.encode();
|
||||
// Check if checkpoint data was updated by save_decoded_record
|
||||
if new_checkpoint_bytes != old_checkpoint_bytes {
|
||||
timeline.put_page_image(
|
||||
if checkpoint_modified {
|
||||
let new_checkpoint_bytes = checkpoint.encode();
|
||||
|
||||
writer.put_page_image(
|
||||
RelishTag::Checkpoint,
|
||||
0,
|
||||
lsn,
|
||||
@@ -232,38 +247,10 @@ fn walreceiver_main(
|
||||
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
last_rec_lsn = lsn;
|
||||
}
|
||||
|
||||
// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
|
||||
// "checkpoint" the repository to flush all the changes from WAL we've processed
|
||||
// so far to disk. After this, we don't need the original WAL anymore, and it
|
||||
// can be removed. This is probably too aggressive for production, but it's useful
|
||||
// to expose bugs now.
|
||||
//
|
||||
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
|
||||
// and we might it for logical decoding other things in the future. Although
|
||||
// we should also be able to fetch it back from the WAL safekeepers or S3 if
|
||||
// needed.
|
||||
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
{
|
||||
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
|
||||
let (oldest_segno, newest_segno) = find_wal_file_range(
|
||||
conf,
|
||||
&timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
last_rec_lsn,
|
||||
&tenantid,
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
caught_up = true;
|
||||
@@ -285,7 +272,7 @@ fn walreceiver_main(
|
||||
);
|
||||
|
||||
if reply_requested {
|
||||
Some(timeline.get_last_record_lsn())
|
||||
Some(last_rec_lsn)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -305,51 +292,15 @@ fn walreceiver_main(
|
||||
|
||||
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
}
|
||||
|
||||
if tenant_mgr::shutdown_requested() {
|
||||
debug!("stop walreceiver because pageserver shutdown is requested");
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_wal_file_range(
|
||||
conf: &PageServerConf,
|
||||
timeline: &ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
written_upto: Lsn,
|
||||
tenant: &ZTenantId,
|
||||
) -> Result<(u64, u64)> {
|
||||
let written_upto_segno = written_upto.segment_number(wal_seg_size);
|
||||
|
||||
let mut oldest_segno = written_upto_segno;
|
||||
let mut newest_segno = written_upto_segno;
|
||||
// Scan the wal directory, and count how many WAL filed we could remove
|
||||
let wal_dir = conf.wal_dir_path(timeline, tenant);
|
||||
for entry in fs::read_dir(wal_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let filename = path.file_name().unwrap().to_str().unwrap();
|
||||
|
||||
if IsXLogFileName(filename) {
|
||||
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
|
||||
|
||||
if segno > written_upto_segno {
|
||||
// that's strange.
|
||||
warn!("there is a WAL file from future at {}", path.display());
|
||||
continue;
|
||||
}
|
||||
|
||||
oldest_segno = min(oldest_segno, segno);
|
||||
newest_segno = max(newest_segno, segno);
|
||||
}
|
||||
}
|
||||
// FIXME: would be good to assert that there are no gaps in the WAL files
|
||||
|
||||
Ok((oldest_segno, newest_segno))
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
|
||||
@@ -565,22 +565,16 @@ impl PostgresRedoProcess {
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
if let Some(img) = base_img {
|
||||
timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
|
||||
}
|
||||
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.write_all(&build_apply_record_msg(rec.lsn, &rec.rec))
|
||||
.await?;
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
@@ -617,58 +611,41 @@ impl PostgresRedoProcess {
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
|
||||
// FIXME: this is a temporary hack that should go away when we refactor
|
||||
// the postgres protocol serialization + handlers.
|
||||
//
|
||||
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
|
||||
// not in the std library. To write to a BytesMut from a serde serializer,
|
||||
// we need to either:
|
||||
// - pre-allocate the required buffer space. This is annoying because we
|
||||
// shouldn't care what the exact serialized size is-- that's the
|
||||
// serializer's job.
|
||||
// - Or, we need to create a temporary "writer" (which implements the
|
||||
// `Write` trait). It's a bit awkward, because the writer consumes the
|
||||
// underlying BytesMut, and we need to extract it later with
|
||||
// `into_inner`.
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let mut buf = writer.into_inner();
|
||||
buf.put(base_img);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
let mut buf: Vec<u8> = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u32(len as u32);
|
||||
@@ -677,21 +654,19 @@ fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
buf
|
||||
}
|
||||
|
||||
@@ -377,10 +377,12 @@ impl CheckPoint {
|
||||
Ok(CheckPoint::des(buf)?)
|
||||
}
|
||||
|
||||
// Update next XID based on provided new_xid and stored epoch.
|
||||
// Next XID should be greater than new_xid.
|
||||
// Also take in account 32-bit wrap-around.
|
||||
pub fn update_next_xid(&mut self, xid: u32) {
|
||||
/// Update next XID based on provided new_xid and stored epoch.
|
||||
/// Next XID should be greater than new_xid. This handles 32-bit
|
||||
/// XID wraparound correctly.
|
||||
///
|
||||
/// Returns 'true' if the XID was updated.
|
||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
|
||||
let full_xid = self.nextXid.value;
|
||||
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
@@ -391,10 +393,14 @@ impl CheckPoint {
|
||||
// wrap-around
|
||||
epoch += 1;
|
||||
}
|
||||
self.nextXid = FullTransactionId {
|
||||
value: (epoch << 32) | new_xid as u64,
|
||||
};
|
||||
let nextXid = (epoch << 32) | new_xid as u64;
|
||||
|
||||
if nextXid != self.nextXid.value {
|
||||
self.nextXid = FullTransactionId { value: nextXid };
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:
|
||||
|
||||
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let mut conn_handler = MgmtHandler { state };
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
|
||||
@@ -64,6 +64,7 @@ pub fn proxy_conn_main(
|
||||
socket,
|
||||
postgres_backend::AuthType::MD5,
|
||||
state.conf.ssl_config.clone(),
|
||||
false,
|
||||
)?,
|
||||
md5_salt: [0u8; 4],
|
||||
psql_session_id: "".into(),
|
||||
|
||||
@@ -33,11 +33,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
FROM generate_series(1, 200000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
print('LSN after 200100 rows: ' + lsn_b)
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
@@ -46,15 +46,15 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
FROM generate_series(1, 200000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
print('LSN after 400100 rows: ' + lsn_c)
|
||||
|
||||
# Branch at the point where only 200 rows were inserted
|
||||
# Branch at the point where only 200100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
@@ -70,11 +70,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
more_pg_conn = pg_more.connect()
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (100100, )
|
||||
assert more_cur.fetchone() == (200100, )
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (200100, )
|
||||
assert main_cur.fetchone() == (400100, )
|
||||
|
||||
# Check bad lsn's for branching
|
||||
|
||||
|
||||
@@ -140,6 +140,30 @@ class ZenithBenchmarker:
|
||||
re.MULTILINE)
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
def get_peak_mem(self, pageserver) -> int:
|
||||
"""
|
||||
Fetch the "maxrss" metric from the pageserver
|
||||
"""
|
||||
# Fetch all the exposed prometheus metrics from page server
|
||||
all_metrics = pageserver.http_client().get_metrics()
|
||||
# See comment in get_io_writes()
|
||||
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics,
|
||||
re.MULTILINE)
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):
|
||||
"""
|
||||
Calculate the on-disk size of a timeline
|
||||
"""
|
||||
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
|
||||
|
||||
totalbytes = 0
|
||||
for root, dirs, files in os.walk(path):
|
||||
for name in files:
|
||||
totalbytes += os.path.getsize(os.path.join(root, name))
|
||||
|
||||
return totalbytes
|
||||
|
||||
@contextmanager
|
||||
def record_pageserver_writes(self, pageserver, metric_name):
|
||||
"""
|
||||
|
||||
@@ -64,7 +64,7 @@ def pytest_configure(config):
|
||||
raise Exception('Too many workers configured. Cannot distrubute ports for services.')
|
||||
|
||||
# does not use -c as it is not supported on macOS
|
||||
cmd = ['pgrep', 'pageserver|postgres|wal_acceptor']
|
||||
cmd = ['pgrep', 'pageserver|postgres|safekeeper']
|
||||
result = subprocess.run(cmd, stdout=subprocess.DEVNULL)
|
||||
if result.returncode == 0:
|
||||
# returncode of 0 means it found something.
|
||||
@@ -72,7 +72,7 @@ def pytest_configure(config):
|
||||
# result of the test.
|
||||
# NOTE this shows as an internal pytest error, there might be a better way
|
||||
raise Exception(
|
||||
'Found interfering processes running. Stop all Zenith pageservers, nodes, WALs, as well as stand-alone Postgres.'
|
||||
'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.'
|
||||
)
|
||||
|
||||
|
||||
@@ -375,6 +375,7 @@ class ZenithPageserver(PgProtocol):
|
||||
Start the page server.
|
||||
Returns self.
|
||||
"""
|
||||
assert self.running == False
|
||||
|
||||
self.zenith_cli.run(['start'])
|
||||
self.running = True
|
||||
@@ -382,14 +383,18 @@ class ZenithPageserver(PgProtocol):
|
||||
self.initial_tenant = self.zenith_cli.run(['tenant', 'list']).stdout.strip()
|
||||
return self
|
||||
|
||||
def stop(self) -> 'ZenithPageserver':
|
||||
def stop(self, immediate=False) -> 'ZenithPageserver':
|
||||
"""
|
||||
Stop the page server.
|
||||
Returns self.
|
||||
"""
|
||||
cmd = ['stop']
|
||||
if immediate:
|
||||
cmd.append('immediate')
|
||||
|
||||
print(cmd)
|
||||
if self.running:
|
||||
self.zenith_cli.run(['stop'])
|
||||
self.zenith_cli.run(cmd)
|
||||
self.running = False
|
||||
|
||||
return self
|
||||
@@ -398,7 +403,7 @@ class ZenithPageserver(PgProtocol):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
self.stop()
|
||||
self.stop(True)
|
||||
|
||||
@cached_property
|
||||
def auth_keys(self) -> AuthKeys:
|
||||
@@ -444,7 +449,7 @@ def pageserver(zenith_cli: ZenithCli, repo_dir: str, pageserver_port: Pageserver
|
||||
|
||||
# After the yield comes any cleanup code we need.
|
||||
print('Starting pageserver cleanup')
|
||||
ps.stop()
|
||||
ps.stop(True)
|
||||
|
||||
class PgBin:
|
||||
""" A helper class for executing postgres binaries """
|
||||
@@ -794,12 +799,17 @@ def read_pid(path: Path):
|
||||
return int(path.read_text())
|
||||
|
||||
|
||||
@dataclass
|
||||
class WalAcceptorPort:
|
||||
pg: int
|
||||
http: int
|
||||
|
||||
@dataclass
|
||||
class WalAcceptor:
|
||||
""" An object representing a running wal acceptor daemon. """
|
||||
wa_bin_path: Path
|
||||
data_dir: Path
|
||||
port: int
|
||||
port: WalAcceptorPort
|
||||
num: int # identifier for logging
|
||||
pageserver_port: int
|
||||
auth_token: Optional[str] = None
|
||||
@@ -811,7 +821,8 @@ class WalAcceptor:
|
||||
|
||||
cmd = [str(self.wa_bin_path)]
|
||||
cmd.extend(["-D", str(self.data_dir)])
|
||||
cmd.extend(["-l", f"localhost:{self.port}"])
|
||||
cmd.extend(["--listen-pg", f"localhost:{self.port.pg}"])
|
||||
cmd.extend(["--listen-http", f"localhost:{self.port.http}"])
|
||||
cmd.append("--daemonize")
|
||||
cmd.append("--no-sync")
|
||||
# Tell page server it can receive WAL from this WAL safekeeper
|
||||
@@ -832,7 +843,7 @@ class WalAcceptor:
|
||||
|
||||
@property
|
||||
def pidfile(self) -> Path:
|
||||
return self.data_dir / "wal_acceptor.pid"
|
||||
return self.data_dir / "safekeeper.pid"
|
||||
|
||||
def get_pid(self) -> Optional[int]:
|
||||
if not self.pidfile.exists():
|
||||
@@ -868,14 +879,14 @@ class WalAcceptor:
|
||||
|
||||
# "replication=0" hacks psycopg not to send additional queries
|
||||
# on startup, see https://github.com/psycopg/psycopg2/pull/482
|
||||
connstr = f"host=localhost port={self.port} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'"
|
||||
connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'"
|
||||
|
||||
with closing(psycopg2.connect(connstr)) as conn:
|
||||
# server doesn't support transactions
|
||||
conn.autocommit = True
|
||||
with conn.cursor() as cur:
|
||||
request_json = json.dumps(request)
|
||||
print(f"JSON_CTRL request on port {self.port}: {request_json}")
|
||||
print(f"JSON_CTRL request on port {self.port.pg}: {request_json}")
|
||||
cur.execute("JSON_CTRL " + request_json)
|
||||
all = cur.fetchall()
|
||||
print(f"JSON_CTRL response: {all[0][0]}")
|
||||
@@ -884,7 +895,7 @@ class WalAcceptor:
|
||||
class WalAcceptorFactory:
|
||||
""" An object representing multiple running wal acceptors. """
|
||||
def __init__(self, zenith_binpath: Path, data_dir: Path, pageserver_port: int, port_distributor: PortDistributor):
|
||||
self.wa_bin_path = zenith_binpath / 'wal_acceptor'
|
||||
self.wa_bin_path = zenith_binpath / 'safekeeper'
|
||||
self.data_dir = data_dir
|
||||
self.instances: List[WalAcceptor] = []
|
||||
self.port_distributor = port_distributor
|
||||
@@ -898,7 +909,10 @@ class WalAcceptorFactory:
|
||||
wa = WalAcceptor(
|
||||
wa_bin_path=self.wa_bin_path,
|
||||
data_dir=self.data_dir / "wal_acceptor_{}".format(wa_num),
|
||||
port=self.port_distributor.get_port(),
|
||||
port=WalAcceptorPort(
|
||||
pg=self.port_distributor.get_port(),
|
||||
http=self.port_distributor.get_port(),
|
||||
),
|
||||
num=wa_num,
|
||||
pageserver_port=self.pageserver_port,
|
||||
auth_token=auth_token,
|
||||
@@ -922,7 +936,7 @@ class WalAcceptorFactory:
|
||||
|
||||
def get_connstrs(self) -> str:
|
||||
""" Get list of wal acceptor endpoints suitable for wal_acceptors GUC """
|
||||
return ','.join(["localhost:{}".format(wa.port) for wa in self.instances])
|
||||
return ','.join(["localhost:{}".format(wa.port.pg) for wa in self.instances])
|
||||
|
||||
|
||||
@zenfixture
|
||||
|
||||
@@ -4,19 +4,6 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
|
||||
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
|
||||
|
||||
totalbytes = 0
|
||||
for root, dirs, files in os.walk(path):
|
||||
for name in files:
|
||||
totalbytes += os.path.getsize(os.path.join(root, name))
|
||||
|
||||
if 'wal' in dirs:
|
||||
dirs.remove('wal') # don't visit 'wal' subdirectory
|
||||
|
||||
return totalbytes
|
||||
|
||||
#
|
||||
# Run bulk INSERT test.
|
||||
#
|
||||
@@ -25,6 +12,7 @@ def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
|
||||
# 1. Time to INSERT 5 million rows
|
||||
# 2. Disk writes
|
||||
# 3. Disk space used
|
||||
# 4. Peak memory usage
|
||||
#
|
||||
def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
|
||||
# Create a branch for us
|
||||
@@ -55,6 +43,9 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg
|
||||
# time and I/O
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
|
||||
# Record peak memory usage
|
||||
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
|
||||
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
|
||||
|
||||
48
test_runner/performance/test_gist_build.py
Normal file
48
test_runner/performance/test_gist_build.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
#
|
||||
# Test buffering GisT build. It WAL-logs the whole relation, in 32-page chunks.
|
||||
# As of this writing, we're duplicate those giant WAL records for each page,
|
||||
# which makes the delta layer about 32x larger than it needs to be.
|
||||
#
|
||||
def test_gist_buffering_build(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_gist_buffering_build", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_gist_buffering_build')
|
||||
print("postgres is running on 'test_gist_buffering_build' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = pageserver.connect();
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
# Create test table.
|
||||
cur.execute("create table gist_point_tbl(id int4, p point)");
|
||||
cur.execute("insert into gist_point_tbl select g, point(g, g) from generate_series(1, 1000000) g;");
|
||||
|
||||
# Build the index.
|
||||
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
|
||||
with zenbenchmark.record_duration('build'):
|
||||
cur.execute("create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)");
|
||||
|
||||
# Flush the layers from memory to disk. This is included in the reported
|
||||
# time and I/O
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 1000000")
|
||||
|
||||
# Record peak memory usage
|
||||
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
|
||||
@@ -4,19 +4,6 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
|
||||
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
|
||||
|
||||
totalbytes = 0
|
||||
for root, dirs, files in os.walk(path):
|
||||
for name in files:
|
||||
totalbytes += os.path.getsize(os.path.join(root, name))
|
||||
|
||||
if 'wal' in dirs:
|
||||
dirs.remove('wal') # don't visit 'wal' subdirectory
|
||||
|
||||
return totalbytes
|
||||
|
||||
#
|
||||
# Run a very short pgbench test.
|
||||
#
|
||||
@@ -64,5 +51,5 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
|
||||
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
|
||||
|
||||
74
test_runner/performance/test_write_amplification.py
Normal file
74
test_runner/performance/test_write_amplification.py
Normal file
@@ -0,0 +1,74 @@
|
||||
# Demonstrate Write Amplification with naive oldest-first layer checkpointing
|
||||
# algorithm.
|
||||
#
|
||||
# In each iteration of the test, we create a new table that's slightly under 10
|
||||
# MB in size (10 MB is the current "segment size" used by the page server). Then
|
||||
# we make a tiny update to all the tables already created. This creates a WAL
|
||||
# pattern where you have a lot of updates on one segment (the newly created
|
||||
# one), alternating with a small updates on all relations. This is the worst
|
||||
# case scenario for the naive checkpointing policy where we write out the layers
|
||||
# in LSN order, writing the oldest layer first. That creates a new 10 MB image
|
||||
# layer to be created for each of those small updates. This is the Write
|
||||
# Amplification problem at its finest.
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_write_amplification", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_write_amplification')
|
||||
print("postgres is running on 'test_write_amplification' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = pageserver.connect();
|
||||
pscur = psconn.cursor()
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
|
||||
with zenbenchmark.record_duration('run'):
|
||||
|
||||
# NOTE: Because each iteration updates every table already created,
|
||||
# the runtime and write amplification is O(n^2), where n is the
|
||||
# number of iterations.
|
||||
for i in range(25):
|
||||
cur.execute(f'''
|
||||
CREATE TABLE tbl{i} AS
|
||||
SELECT g as i, 'long string to consume some space' || g as t
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
cur.execute(f"create index on tbl{i} (i);")
|
||||
for j in range(1, i):
|
||||
cur.execute(f"delete from tbl{j} where i = {i}")
|
||||
|
||||
# Force checkpointing. As of this writing, we don't have
|
||||
# a back-pressure mechanism, and the page server cannot
|
||||
# keep up digesting and checkpointing the WAL at the
|
||||
# rate that it is generated. If we don't force a
|
||||
# checkpoint, the WAL will just accumulate in memory
|
||||
# until you hit OOM error. So in effect, we use much
|
||||
# more memory to hold the incoming WAL, and write them
|
||||
# out in larger batches than we'd really want. Using
|
||||
# more memory hides the write amplification problem this
|
||||
# test tries to demonstrate.
|
||||
#
|
||||
# The write amplification problem is real, and using
|
||||
# more memory isn't the right solution. We could
|
||||
# demonstrate the effect also by generating the WAL
|
||||
# slower, adding some delays in this loop. But forcing
|
||||
# the the checkpointing and GC makes the test go faster,
|
||||
# with the same total I/O effect.
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
|
||||
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: 56c561aa77...5387eb4a3b
@@ -28,9 +28,11 @@ humantime = "2.1.0"
|
||||
walkdir = "2"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
hex = "0.4.3"
|
||||
const_format = "0.2.21"
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
@@ -89,12 +89,12 @@ A: Page Server is a single server which can be lost. As our primary
|
||||
|
||||
Q: What if the compute node evicts a page, needs it back, but the page is yet
|
||||
to reach the Page Server?
|
||||
A: If the compute node has evicted a page, all changes from that page are
|
||||
already committed, i.e. they are saved on majority of WAL safekeepers. These
|
||||
WAL records will eventually reach the Page Server. The Page Server notes
|
||||
that the compute note requests pages with a very recent LSN and will not
|
||||
respond to the compute node until it a corresponding WAL is received from WAL
|
||||
safekeepers.
|
||||
A: If the compute node has evicted a page, changes to it have been WAL-logged
|
||||
(that's why it is called Write Ahead logging; there are some exceptions like
|
||||
index builds, but these are exceptions). These WAL records will eventually
|
||||
reach the Page Server. The Page Server notes that the compute note requests
|
||||
pages with a very recent LSN and will not respond to the compute node until a
|
||||
corresponding WAL is received from WAL safekeepers.
|
||||
|
||||
Q: How long may Page Server wait for?
|
||||
A: Not too long, hopefully. If a page is evicted, it probably was not used for
|
||||
|
||||
@@ -1,35 +1,47 @@
|
||||
//
|
||||
// Main entry point for the wal_acceptor executable
|
||||
// Main entry point for the safekeeper executable
|
||||
//
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use const_format::formatcp;
|
||||
use daemonize::Daemonize;
|
||||
use log::*;
|
||||
use std::env;
|
||||
use std::net::TcpListener;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::thread;
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::logging;
|
||||
|
||||
use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
|
||||
use walkeeper::s3_offload;
|
||||
use walkeeper::wal_service;
|
||||
use walkeeper::WalAcceptorConf;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith wal_acceptor")
|
||||
zenith_metrics::set_common_metrics_prefix("safekeeper");
|
||||
let arg_matches = App::new("Zenith safekeeper")
|
||||
.about("Store WAL stream to local file system and push it to WAL receivers")
|
||||
.arg(
|
||||
Arg::with_name("datadir")
|
||||
.short("D")
|
||||
.long("dir")
|
||||
.takes_value(true)
|
||||
.help("Path to the WAL acceptor data directory"),
|
||||
.help("Path to the safekeeper data directory"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
Arg::with_name("listen-pg")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.long("listen-pg")
|
||||
.alias("listen") // for compatibility
|
||||
.takes_value(true)
|
||||
.help("listen for incoming connections on ip:port (default: 127.0.0.1:5454)"),
|
||||
.help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen-http")
|
||||
.long("listen-http")
|
||||
.takes_value(true)
|
||||
.help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("pageserver")
|
||||
@@ -70,7 +82,8 @@ fn main() -> Result<()> {
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
pageserver_addr: None,
|
||||
listen_addr: "localhost:5454".to_string(),
|
||||
listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
ttl: None,
|
||||
recall_period: None,
|
||||
pageserver_auth_token: env::var("PAGESERVER_AUTH_TOKEN").ok(),
|
||||
@@ -91,8 +104,12 @@ fn main() -> Result<()> {
|
||||
conf.daemonize = true;
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("listen") {
|
||||
conf.listen_addr = addr.to_owned();
|
||||
if let Some(addr) = arg_matches.value_of("listen-pg") {
|
||||
conf.listen_pg_addr = addr.to_owned();
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("listen-http") {
|
||||
conf.listen_http_addr = addr.to_owned();
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("pageserver") {
|
||||
@@ -111,8 +128,19 @@ fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
let log_filename = conf.data_dir.join("wal_acceptor.log");
|
||||
let (_scope_guard, log_file) = logging::init(log_filename, conf.daemonize)?;
|
||||
let log_filename = conf.data_dir.join("safekeeper.log");
|
||||
let log_file = logging::init(log_filename, conf.daemonize)?;
|
||||
|
||||
let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
info!("Starting safekeeper on {}", conf.listen_pg_addr);
|
||||
let pg_listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
@@ -123,7 +151,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file("wal_acceptor.pid")
|
||||
.pid_file("safekeeper.pid")
|
||||
.working_directory(Path::new("."))
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
@@ -136,6 +164,16 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
|
||||
let mut threads = Vec::new();
|
||||
|
||||
let http_endpoint_thread = thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(|| {
|
||||
// No authentication at all: read-only metrics only, early stage.
|
||||
let router = endpoint::make_router();
|
||||
endpoint::serve_thread_main(router, http_listener).unwrap();
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(http_endpoint_thread);
|
||||
|
||||
if conf.ttl.is_some() {
|
||||
let s3_conf = conf.clone();
|
||||
let s3_offload_thread = thread::Builder::new()
|
||||
@@ -152,7 +190,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
.name("WAL acceptor thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
let thread_result = wal_service::thread_main(conf);
|
||||
let thread_result = wal_service::thread_main(conf, pg_listener);
|
||||
if let Err(e) = thread_result {
|
||||
info!("wal_service thread terminated: {}", e);
|
||||
}
|
||||
@@ -11,12 +11,23 @@ pub mod send_wal;
|
||||
pub mod timeline;
|
||||
pub mod wal_service;
|
||||
|
||||
pub mod defaults {
|
||||
use const_format::formatcp;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WalAcceptorConf {
|
||||
pub data_dir: PathBuf,
|
||||
pub daemonize: bool,
|
||||
pub no_sync: bool,
|
||||
pub listen_addr: String,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub pageserver_addr: Option<String>,
|
||||
// TODO (create issue) this is temporary, until protocol between PG<->SK<->PS rework
|
||||
pub pageserver_auth_token: Option<String>,
|
||||
|
||||
@@ -42,7 +42,7 @@ fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId, tenantid: ZT
|
||||
);
|
||||
|
||||
// use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses
|
||||
let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_addr);
|
||||
let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_pg_addr);
|
||||
let me_conf: Config = me_connstr.parse().unwrap();
|
||||
let (host, port) = connection_host_port(&me_conf);
|
||||
let callme = format!(
|
||||
|
||||
@@ -15,8 +15,11 @@ use std::cmp::min;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use crate::replication::HotStandbyFeedback;
|
||||
use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
|
||||
use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec};
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::pq_proto::SystemId;
|
||||
@@ -279,6 +282,38 @@ pub trait Storage {
|
||||
fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>;
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
|
||||
// i64 is faster than f64, so update to u64 when available.
|
||||
static ref FLUSH_LSN_GAUGE: GaugeVec = register_gauge_vec!(
|
||||
"safekeeper_flush_lsn",
|
||||
"Current flush_lsn, grouped by timeline",
|
||||
&["ztli"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_flush_lsn int gauge vec");
|
||||
static ref COMMIT_LSN_GAUGE: GaugeVec = register_gauge_vec!(
|
||||
"safekeeper_commit_lsn",
|
||||
"Current commit_lsn (not necessarily persisted to disk), grouped by timeline",
|
||||
&["ztli"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_commit_lsn int gauge vec");
|
||||
}
|
||||
|
||||
struct SafeKeeperMetrics {
|
||||
flush_lsn: Gauge,
|
||||
commit_lsn: Gauge,
|
||||
}
|
||||
|
||||
impl SafeKeeperMetrics {
|
||||
fn new(ztli: ZTimelineId) -> SafeKeeperMetrics {
|
||||
let ztli_str = format!("{}", ztli);
|
||||
SafeKeeperMetrics {
|
||||
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&ztli_str]),
|
||||
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&ztli_str]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// SafeKeeper which consumes events (messages from compute) and provides
|
||||
/// replies.
|
||||
pub struct SafeKeeper<ST: Storage> {
|
||||
@@ -286,6 +321,8 @@ pub struct SafeKeeper<ST: Storage> {
|
||||
/// Established by reading wal.
|
||||
pub flush_lsn: Lsn,
|
||||
pub tli: u32,
|
||||
// Cached metrics so we don't have to recompute labels on each update.
|
||||
metrics: Option<SafeKeeperMetrics>,
|
||||
/// not-yet-flushed pairs of same named fields in s.*
|
||||
pub commit_lsn: Lsn,
|
||||
pub truncate_lsn: Lsn,
|
||||
@@ -304,6 +341,7 @@ where
|
||||
SafeKeeper {
|
||||
flush_lsn,
|
||||
tli,
|
||||
metrics: None,
|
||||
commit_lsn: state.commit_lsn,
|
||||
truncate_lsn: state.truncate_lsn,
|
||||
storage,
|
||||
@@ -355,6 +393,8 @@ where
|
||||
self.s.server.wal_seg_size = msg.wal_seg_size;
|
||||
self.storage.persist(&self.s, true)?;
|
||||
|
||||
self.metrics = Some(SafeKeeperMetrics::new(self.s.server.ztli));
|
||||
|
||||
info!(
|
||||
"processed greeting from proposer {:?}, sending term {:?}",
|
||||
msg.proposer_id, self.s.acceptor_state.term
|
||||
@@ -478,6 +518,11 @@ where
|
||||
}
|
||||
if last_rec_lsn > self.flush_lsn {
|
||||
self.flush_lsn = last_rec_lsn;
|
||||
self.metrics
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.flush_lsn
|
||||
.set(u64::from(self.flush_lsn) as f64);
|
||||
}
|
||||
|
||||
// Advance commit_lsn taking into account what we have locally. xxx this
|
||||
@@ -495,6 +540,11 @@ where
|
||||
sync_control_file |=
|
||||
commit_lsn >= msg.h.epoch_start_lsn && self.s.commit_lsn < msg.h.epoch_start_lsn;
|
||||
self.commit_lsn = commit_lsn;
|
||||
self.metrics
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.commit_lsn
|
||||
.set(u64::from(self.commit_lsn) as f64);
|
||||
}
|
||||
|
||||
self.truncate_lsn = msg.h.truncate_lsn;
|
||||
|
||||
@@ -20,7 +20,6 @@ use crate::timeline::CreateControlFile;
|
||||
|
||||
/// Handler for streaming WAL from acceptor
|
||||
pub struct SendWalHandler {
|
||||
/// wal acceptor configuration
|
||||
pub conf: WalAcceptorConf,
|
||||
/// assigned application name
|
||||
pub appname: Option<String>,
|
||||
|
||||
@@ -112,7 +112,7 @@ impl SharedState {
|
||||
}
|
||||
match opts.open(&control_file_path) {
|
||||
Ok(mut file) => {
|
||||
// Lock file to prevent two or more active wal_acceptors
|
||||
// Lock file to prevent two or more active safekeepers
|
||||
match file.try_lock_exclusive() {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
|
||||
@@ -12,13 +12,7 @@ use crate::WalAcceptorConf;
|
||||
use zenith_utils::postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
pub fn thread_main(conf: WalAcceptorConf) -> Result<()> {
|
||||
info!("Starting wal acceptor on {}", conf.listen_addr);
|
||||
let listener = TcpListener::bind(conf.listen_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
pub fn thread_main(conf: WalAcceptorConf, listener: TcpListener) -> Result<()> {
|
||||
loop {
|
||||
match listener.accept() {
|
||||
Ok((socket, peer_addr)) => {
|
||||
@@ -41,8 +35,8 @@ fn handle_socket(socket: TcpStream, conf: WalAcceptorConf) -> Result<()> {
|
||||
socket.set_nodelay(true)?;
|
||||
|
||||
let mut conn_handler = SendWalHandler::new(conf);
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
|
||||
// libpq replication protocol between wal_acceptor and replicas/pagers
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?;
|
||||
// libpq replication protocol between safekeeper and replicas/pagers
|
||||
pgbackend.run(&mut conn_handler)?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -88,7 +88,12 @@ fn main() -> Result<()> {
|
||||
)
|
||||
.subcommand(SubCommand::with_name("status"))
|
||||
.subcommand(SubCommand::with_name("start").about("Start local pageserver"))
|
||||
.subcommand(SubCommand::with_name("stop").about("Stop local pageserver"))
|
||||
.subcommand(SubCommand::with_name("stop").about("Stop local pageserver")
|
||||
.arg(Arg::with_name("immediate")
|
||||
.help("Don't flush repository data at shutdown")
|
||||
.required(false)
|
||||
)
|
||||
)
|
||||
.subcommand(SubCommand::with_name("restart").about("Restart local pageserver"))
|
||||
.subcommand(
|
||||
SubCommand::with_name("pg")
|
||||
@@ -196,10 +201,12 @@ fn main() -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
("stop", Some(_sub_m)) => {
|
||||
("stop", Some(stop_match)) => {
|
||||
let pageserver = PageServerNode::from_env(&env);
|
||||
|
||||
if let Err(e) = pageserver.stop() {
|
||||
let immediate = stop_match.is_present("immediate");
|
||||
|
||||
if let Err(e) = pageserver.stop(immediate) {
|
||||
eprintln!("pageserver stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -208,7 +215,8 @@ fn main() -> Result<()> {
|
||||
("restart", Some(_sub_m)) => {
|
||||
let pageserver = PageServerNode::from_env(&env);
|
||||
|
||||
if let Err(e) = pageserver.stop() {
|
||||
//TODO what shutdown strategy should we use here?
|
||||
if let Err(e) = pageserver.stop(false) {
|
||||
eprintln!("pageserver stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
use lazy_static::lazy_static;
|
||||
use once_cell::race::OnceBox;
|
||||
pub use prometheus::{exponential_buckets, linear_buckets};
|
||||
pub use prometheus::{register_gauge, Gauge};
|
||||
pub use prometheus::{register_gauge_vec, GaugeVec};
|
||||
pub use prometheus::{register_histogram, Histogram};
|
||||
pub use prometheus::{register_histogram_vec, HistogramVec};
|
||||
pub use prometheus::{register_int_counter, IntCounter};
|
||||
@@ -21,7 +23,7 @@ pub use wrappers::{CountedReader, CountedWriter};
|
||||
/// Metrics gathering is a relatively simple and standalone operation, so
|
||||
/// it might be fine to do it this way to keep things simple.
|
||||
pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
|
||||
update_io_metrics();
|
||||
update_rusage_metrics();
|
||||
prometheus::gather()
|
||||
}
|
||||
|
||||
@@ -31,16 +33,29 @@ static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new();
|
||||
/// name like 'pageserver'. Should be executed exactly once in the beginning of
|
||||
/// any executable which uses common metrics.
|
||||
pub fn set_common_metrics_prefix(prefix: &'static str) {
|
||||
COMMON_METRICS_PREFIX.set(prefix.into()).unwrap();
|
||||
// Not unwrap() because metrics may be initialized after multiple threads have been started.
|
||||
COMMON_METRICS_PREFIX
|
||||
.set(prefix.into())
|
||||
.unwrap_or_else(|_| {
|
||||
eprintln!(
|
||||
"set_common_metrics_prefix() was called second time with '{}', exiting",
|
||||
prefix
|
||||
);
|
||||
std::process::exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
/// Prepends a prefix to a common metric name so they are distinguished between
|
||||
/// different services, see https://github.com/zenithdb/zenith/pull/681
|
||||
/// different services, see <https://github.com/zenithdb/zenith/pull/681>
|
||||
/// A call to set_common_metrics_prefix() is necessary prior to calling this.
|
||||
pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
|
||||
// Not unwrap() because metrics may be initialized after multiple threads have been started.
|
||||
format!(
|
||||
"{}_{}",
|
||||
COMMON_METRICS_PREFIX.get().unwrap(),
|
||||
COMMON_METRICS_PREFIX.get().unwrap_or_else(|| {
|
||||
eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting");
|
||||
std::process::exit(1);
|
||||
}),
|
||||
unprefixed_metric_name
|
||||
)
|
||||
}
|
||||
@@ -52,6 +67,11 @@ lazy_static! {
|
||||
&["io_operation"]
|
||||
)
|
||||
.expect("Failed to register disk i/o bytes int gauge vec");
|
||||
static ref MAXRSS_KB: IntGauge = register_int_gauge!(
|
||||
new_common_metric_name("maxrss_kb"),
|
||||
"Memory usage (Maximum Resident Set Size)"
|
||||
)
|
||||
.expect("Failed to register maxrss_kb int gauge");
|
||||
}
|
||||
|
||||
// Records I/O stats in a "cross-platform" way.
|
||||
@@ -63,7 +83,7 @@ lazy_static! {
|
||||
// We know the size of the block, so we can determine the I/O bytes out of it.
|
||||
// The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
fn update_io_metrics() {
|
||||
fn update_rusage_metrics() {
|
||||
let rusage_stats = get_rusage_stats();
|
||||
|
||||
const BYTES_IN_BLOCK: i64 = 512;
|
||||
@@ -73,6 +93,7 @@ fn update_io_metrics() {
|
||||
DISK_IO_BYTES
|
||||
.with_label_values(&["write"])
|
||||
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
|
||||
MAXRSS_KB.set(rusage_stats.ru_maxrss);
|
||||
}
|
||||
|
||||
fn get_rusage_stats() -> libc::rusage {
|
||||
|
||||
@@ -18,12 +18,9 @@ serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
thiserror = "1.0"
|
||||
tokio = "1.11"
|
||||
|
||||
slog-async = "2.6.0"
|
||||
slog-stdlog = "4.1.0"
|
||||
slog-scope = "4.4.0"
|
||||
slog-term = "2.8.0"
|
||||
slog = "2.7.0"
|
||||
tracing = "0.1"
|
||||
tracing-log = "0.1"
|
||||
tracing-subscriber = "0.2"
|
||||
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
@@ -12,8 +12,17 @@ use std::net::TcpListener;
|
||||
use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter};
|
||||
use zenith_metrics::{Encoder, TextEncoder};
|
||||
|
||||
use std::sync::Mutex;
|
||||
use tokio::sync::oneshot::Sender;
|
||||
|
||||
use super::error::ApiError;
|
||||
|
||||
lazy_static! {
|
||||
/// Channel used to send shutdown signal - wrapped in an Option to allow
|
||||
/// it to be taken by value (since oneshot channels consume themselves on send)
|
||||
static ref SHUTDOWN_SENDER: Mutex<Option<Sender<()>>> = Mutex::new(None);
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
|
||||
new_common_metric_name("serve_metrics_count"),
|
||||
@@ -143,11 +152,18 @@ pub fn check_permission(req: &Request<Body>, tenantid: Option<ZTenantId>) -> Res
|
||||
}
|
||||
}
|
||||
|
||||
// Send shutdown signal
|
||||
pub fn shutdown() {
|
||||
if let Some(tx) = SHUTDOWN_SENDER.lock().unwrap().take() {
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
|
||||
pub fn serve_thread_main(
|
||||
router_builder: RouterBuilder<hyper::Body, ApiError>,
|
||||
listener: TcpListener,
|
||||
) -> anyhow::Result<()> {
|
||||
log::info!("Starting a http endoint at {}", listener.local_addr()?);
|
||||
log::info!("Starting a http endpoint at {}", listener.local_addr()?);
|
||||
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
@@ -159,7 +175,14 @@ pub fn serve_thread_main(
|
||||
|
||||
let _guard = runtime.enter();
|
||||
|
||||
let server = Server::from_tcp(listener)?.serve(service);
|
||||
let (send, recv) = tokio::sync::oneshot::channel::<()>();
|
||||
*SHUTDOWN_SENDER.lock().unwrap() = Some(send);
|
||||
|
||||
let server = Server::from_tcp(listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown(async {
|
||||
recv.await.ok();
|
||||
});
|
||||
|
||||
runtime.block_on(server)?;
|
||||
|
||||
|
||||
@@ -8,6 +8,9 @@ pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
pub mod seqwait;
|
||||
|
||||
/// append only ordered map implemented with a Vec
|
||||
pub mod vec_map;
|
||||
|
||||
// Async version of SeqWait. Currently unused.
|
||||
// pub mod seqwait_async;
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use slog::{Drain, Level};
|
||||
use std::{
|
||||
fs::{File, OpenOptions},
|
||||
path::Path,
|
||||
@@ -6,10 +5,12 @@ use std::{
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
pub fn init(
|
||||
log_filename: impl AsRef<Path>,
|
||||
daemonize: bool,
|
||||
) -> Result<(slog_scope::GlobalLoggerGuard, File)> {
|
||||
use tracing::subscriber::set_global_default;
|
||||
use tracing_log::LogTracer;
|
||||
use tracing_subscriber::fmt;
|
||||
use tracing_subscriber::{layer::SubscriberExt, EnvFilter, Registry};
|
||||
|
||||
pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
@@ -18,30 +19,38 @@ pub fn init(
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
|
||||
|
||||
let default_filter_str = "info";
|
||||
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
// the RUST_LOG environment variable is not set.
|
||||
let env_filter =
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(default_filter_str));
|
||||
|
||||
// we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
|
||||
// if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
|
||||
// for example to be in line with docker log command which expects logs comimg from stdout
|
||||
let guard = if daemonize {
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file.try_clone()?);
|
||||
let drain = slog_term::FullFormat::new(decorator)
|
||||
.build()
|
||||
.filter_level(Level::Info)
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
slog_scope::set_global_logger(logger)
|
||||
//
|
||||
// TODO: perhaps use a more human-readable format when !daemonize
|
||||
if daemonize {
|
||||
let x = log_file.try_clone().unwrap();
|
||||
|
||||
let fmt_layer = fmt::layer()
|
||||
.pretty()
|
||||
.with_target(false) // don't include event targets
|
||||
.with_ansi(false) // don't use colors in log file
|
||||
.with_writer(move || x.try_clone().unwrap());
|
||||
let subscriber = Registry::default().with(env_filter).with(fmt_layer);
|
||||
|
||||
set_global_default(subscriber).expect("Failed to set subscriber");
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
let drain = slog_term::FullFormat::new(decorator)
|
||||
.build()
|
||||
.filter_level(Level::Info)
|
||||
.fuse();
|
||||
let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
slog_scope::set_global_logger(logger)
|
||||
};
|
||||
let fmt_layer = fmt::layer().with_target(false); // don't include event targets
|
||||
let subscriber = Registry::default().with(env_filter).with(fmt_layer);
|
||||
|
||||
// initialise forwarding of std log calls
|
||||
slog_stdlog::init()?;
|
||||
set_global_default(subscriber).expect("Failed to set subscriber");
|
||||
}
|
||||
|
||||
Ok((guard, log_file))
|
||||
// Redirect all `log`'s events to our subscriber
|
||||
LogTracer::init().expect("Failed to set logger");
|
||||
|
||||
Ok(log_file)
|
||||
}
|
||||
|
||||
@@ -13,7 +13,11 @@ use serde::{Deserialize, Serialize};
|
||||
use std::io::{self, Write};
|
||||
use std::net::{Shutdown, SocketAddr, TcpStream};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
pub trait Handler {
|
||||
/// Handle single query.
|
||||
@@ -135,13 +139,32 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
|
||||
query_string
|
||||
}
|
||||
|
||||
// Helper function for socket read loops
|
||||
pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
|
||||
for cause in error.chain() {
|
||||
if let Some(io_error) = cause.downcast_ref::<io::Error>() {
|
||||
if io_error.kind() == std::io::ErrorKind::WouldBlock {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
impl PostgresBackend {
|
||||
pub fn new(
|
||||
socket: TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
set_read_timeout: bool,
|
||||
) -> io::Result<Self> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
if set_read_timeout {
|
||||
socket
|
||||
.set_read_timeout(Some(Duration::from_secs(5)))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
|
||||
buf_out: BytesMut::with_capacity(10 * 1024),
|
||||
@@ -229,12 +252,26 @@ impl PostgresBackend {
|
||||
|
||||
let mut unnamed_query_string = Bytes::new();
|
||||
|
||||
while let Some(msg) = self.read_message()? {
|
||||
trace!("got message {:?}", msg);
|
||||
while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) {
|
||||
match self.read_message() {
|
||||
Ok(message) => {
|
||||
if let Some(msg) = message {
|
||||
trace!("got message {:?}", msg);
|
||||
|
||||
match self.process_message(handler, msg, &mut unnamed_query_string)? {
|
||||
ProcessMsgResult::Continue => continue,
|
||||
ProcessMsgResult::Break => break,
|
||||
match self.process_message(handler, msg, &mut unnamed_query_string)? {
|
||||
ProcessMsgResult::Continue => continue,
|
||||
ProcessMsgResult::Break => break,
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// If it is a timeout error, continue the loop
|
||||
if !is_socket_read_timed_out(&e) {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -427,3 +464,8 @@ impl PostgresBackend {
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
}
|
||||
|
||||
// Set the flag to inform connections to cancel
|
||||
pub fn set_pgbackend_shutdown_requested() {
|
||||
PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
293
zenith_utils/src/vec_map.rs
Normal file
293
zenith_utils/src/vec_map.rs
Normal file
@@ -0,0 +1,293 @@
|
||||
use std::{cmp::Ordering, ops::RangeBounds};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Ordered map datastructure implemented in a Vec.
|
||||
/// Append only - can only add keys that are larger than the
|
||||
/// current max key.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct VecMap<K, V>(Vec<(K, V)>);
|
||||
|
||||
impl<K, V> Default for VecMap<K, V> {
|
||||
fn default() -> Self {
|
||||
VecMap(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct InvalidKey;
|
||||
|
||||
impl<K: Ord, V> VecMap<K, V> {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn as_slice(&self) -> &[(K, V)] {
|
||||
self.0.as_slice()
|
||||
}
|
||||
|
||||
/// This function may panic if given a range where the lower bound is
|
||||
/// greater than the upper bound.
|
||||
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
|
||||
use std::ops::Bound::*;
|
||||
|
||||
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
|
||||
|
||||
let start_idx = match range.start_bound() {
|
||||
Unbounded => 0,
|
||||
Included(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
||||
Excluded(k) => match binary_search(k) {
|
||||
Ok(idx) => idx + 1,
|
||||
Err(idx) => idx,
|
||||
},
|
||||
};
|
||||
|
||||
let end_idx = match range.end_bound() {
|
||||
Unbounded => self.0.len(),
|
||||
Included(k) => match binary_search(k) {
|
||||
Ok(idx) => idx + 1,
|
||||
Err(idx) => idx,
|
||||
},
|
||||
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
||||
};
|
||||
|
||||
&self.0[start_idx..end_idx]
|
||||
}
|
||||
|
||||
/// Add a key value pair to the map.
|
||||
/// If `key` is less than or equal to the current maximum key
|
||||
/// the pair will not be added and InvalidKey error will be returned.
|
||||
pub fn append(&mut self, key: K, value: V) -> Result<(), InvalidKey> {
|
||||
if let Some((last_key, _last_value)) = self.0.last() {
|
||||
if &key <= last_key {
|
||||
return Err(InvalidKey);
|
||||
}
|
||||
}
|
||||
|
||||
self.0.push((key, value));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update the maximum key value pair or add a new key value pair to the map.
|
||||
/// If `key` is less than the current maximum key no updates or additions
|
||||
/// will occur and InvalidKey error will be returned.
|
||||
pub fn append_or_update_last(&mut self, key: K, mut value: V) -> Result<Option<V>, InvalidKey> {
|
||||
if let Some((last_key, last_value)) = self.0.last_mut() {
|
||||
match key.cmp(last_key) {
|
||||
Ordering::Less => return Err(InvalidKey),
|
||||
Ordering::Equal => {
|
||||
std::mem::swap(last_value, &mut value);
|
||||
return Ok(Some(value));
|
||||
}
|
||||
Ordering::Greater => {}
|
||||
}
|
||||
}
|
||||
|
||||
self.0.push((key, value));
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Split the map into two.
|
||||
///
|
||||
/// The left map contains everything before `cutoff` (exclusive).
|
||||
/// Right map contains `cutoff` and everything after (inclusive).
|
||||
pub fn split_at(&self, cutoff: &K) -> (Self, Self)
|
||||
where
|
||||
K: Clone,
|
||||
V: Clone,
|
||||
{
|
||||
let split_idx = self
|
||||
.0
|
||||
.binary_search_by_key(&cutoff, extract_key)
|
||||
.unwrap_or_else(std::convert::identity);
|
||||
|
||||
(
|
||||
VecMap(self.0[..split_idx].to_vec()),
|
||||
VecMap(self.0[split_idx..].to_vec()),
|
||||
)
|
||||
}
|
||||
|
||||
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
||||
/// If any keys in `other` is less than or equal to any key in `self`,
|
||||
/// `InvalidKey` error will be returned and no mutation will occur.
|
||||
pub fn extend(&mut self, other: &mut Self) -> Result<(), InvalidKey> {
|
||||
let self_last_opt = self.0.last().map(extract_key);
|
||||
let other_first_opt = other.0.last().map(extract_key);
|
||||
|
||||
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
|
||||
if self_last >= other_first {
|
||||
return Err(InvalidKey);
|
||||
}
|
||||
}
|
||||
|
||||
self.0.append(&mut other.0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||
&entry.0
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::BTreeMap, ops::Bound};
|
||||
|
||||
use super::VecMap;
|
||||
|
||||
#[test]
|
||||
fn unbounded_range() {
|
||||
let mut vec = VecMap::default();
|
||||
vec.append(0, ()).unwrap();
|
||||
|
||||
assert_eq!(vec.slice_range(0..0), &[]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn invalid_ordering_range() {
|
||||
let mut vec = VecMap::default();
|
||||
vec.append(0, ()).unwrap();
|
||||
|
||||
#[allow(clippy::reversed_empty_ranges)]
|
||||
vec.slice_range(1..0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_tests() {
|
||||
let mut vec = VecMap::default();
|
||||
vec.append(0, ()).unwrap();
|
||||
vec.append(2, ()).unwrap();
|
||||
vec.append(4, ()).unwrap();
|
||||
|
||||
assert_eq!(vec.slice_range(0..0), &[]);
|
||||
assert_eq!(vec.slice_range(0..1), &[(0, ())]);
|
||||
assert_eq!(vec.slice_range(0..2), &[(0, ())]);
|
||||
assert_eq!(vec.slice_range(0..3), &[(0, ()), (2, ())]);
|
||||
|
||||
assert_eq!(vec.slice_range(..0), &[]);
|
||||
assert_eq!(vec.slice_range(..1), &[(0, ())]);
|
||||
|
||||
assert_eq!(vec.slice_range(..3), &[(0, ()), (2, ())]);
|
||||
assert_eq!(vec.slice_range(..3), &[(0, ()), (2, ())]);
|
||||
|
||||
assert_eq!(vec.slice_range(0..=0), &[(0, ())]);
|
||||
assert_eq!(vec.slice_range(0..=1), &[(0, ())]);
|
||||
assert_eq!(vec.slice_range(0..=2), &[(0, ()), (2, ())]);
|
||||
assert_eq!(vec.slice_range(0..=3), &[(0, ()), (2, ())]);
|
||||
|
||||
assert_eq!(vec.slice_range(..=0), &[(0, ())]);
|
||||
assert_eq!(vec.slice_range(..=1), &[(0, ())]);
|
||||
assert_eq!(vec.slice_range(..=2), &[(0, ()), (2, ())]);
|
||||
assert_eq!(vec.slice_range(..=3), &[(0, ()), (2, ())]);
|
||||
}
|
||||
|
||||
struct BoundIter {
|
||||
min: i32,
|
||||
max: i32,
|
||||
|
||||
next: Option<Bound<i32>>,
|
||||
}
|
||||
|
||||
impl BoundIter {
|
||||
fn new(min: i32, max: i32) -> Self {
|
||||
Self {
|
||||
min,
|
||||
max,
|
||||
|
||||
next: Some(Bound::Unbounded),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for BoundIter {
|
||||
type Item = Bound<i32>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let cur = self.next?;
|
||||
|
||||
self.next = match &cur {
|
||||
Bound::Unbounded => Some(Bound::Included(self.min)),
|
||||
Bound::Included(x) => {
|
||||
if *x >= self.max {
|
||||
Some(Bound::Excluded(self.min))
|
||||
} else {
|
||||
Some(Bound::Included(x + 1))
|
||||
}
|
||||
}
|
||||
Bound::Excluded(x) => {
|
||||
if *x >= self.max {
|
||||
None
|
||||
} else {
|
||||
Some(Bound::Excluded(x + 1))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Some(cur)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_exhaustive() {
|
||||
let map: BTreeMap<i32, ()> = (1..=7).step_by(2).map(|x| (x, ())).collect();
|
||||
let mut vec = VecMap::default();
|
||||
for &key in map.keys() {
|
||||
vec.append(key, ()).unwrap();
|
||||
}
|
||||
|
||||
const RANGE_MIN: i32 = 0;
|
||||
const RANGE_MAX: i32 = 8;
|
||||
for lower_bound in BoundIter::new(RANGE_MIN, RANGE_MAX) {
|
||||
let ub_min = match lower_bound {
|
||||
Bound::Unbounded => RANGE_MIN,
|
||||
Bound::Included(x) => x,
|
||||
Bound::Excluded(x) => x + 1,
|
||||
};
|
||||
for upper_bound in BoundIter::new(ub_min, RANGE_MAX) {
|
||||
let map_range: Vec<(i32, ())> = map
|
||||
.range((lower_bound, upper_bound))
|
||||
.map(|(&x, _)| (x, ()))
|
||||
.collect();
|
||||
let vec_slice = vec.slice_range((lower_bound, upper_bound));
|
||||
|
||||
assert_eq!(map_range, vec_slice);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extend() {
|
||||
let mut left = VecMap::default();
|
||||
left.append(0, ()).unwrap();
|
||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||
|
||||
let mut empty = VecMap::default();
|
||||
left.extend(&mut empty).unwrap();
|
||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||
assert_eq!(empty.as_slice(), &[]);
|
||||
|
||||
let mut right = VecMap::default();
|
||||
right.append(1, ()).unwrap();
|
||||
|
||||
left.extend(&mut right).unwrap();
|
||||
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(right.as_slice(), &[]);
|
||||
|
||||
let mut zero_map = VecMap::default();
|
||||
zero_map.append(0, ()).unwrap();
|
||||
|
||||
left.extend(&mut zero_map).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(zero_map.as_slice(), &[(0, ())]);
|
||||
|
||||
let mut one_map = VecMap::default();
|
||||
one_map.append(1, ()).unwrap();
|
||||
|
||||
left.extend(&mut one_map).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(one_map.as_slice(), &[(1, ())]);
|
||||
}
|
||||
}
|
||||
@@ -110,7 +110,7 @@ fn ssl() {
|
||||
.unwrap();
|
||||
let tls_config = Some(Arc::new(cfg));
|
||||
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).unwrap();
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
|
||||
pgb.run(&mut handler).unwrap();
|
||||
assert!(handler.got_query);
|
||||
|
||||
@@ -150,7 +150,7 @@ fn no_ssl() {
|
||||
|
||||
let mut handler = TestHandler;
|
||||
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None).unwrap();
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None, true).unwrap();
|
||||
pgb.run(&mut handler).unwrap();
|
||||
|
||||
client_jh.join().unwrap();
|
||||
@@ -214,7 +214,7 @@ fn server_forces_ssl() {
|
||||
.unwrap();
|
||||
let tls_config = Some(Arc::new(cfg));
|
||||
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).unwrap();
|
||||
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
|
||||
let res = pgb.run(&mut handler).unwrap_err();
|
||||
assert_eq!("client did not connect with TLS", format!("{}", res));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user