mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-18 21:50:37 +00:00
Compare commits
406 Commits
problame/h
...
release-pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c91905e643 | ||
|
|
44b4e355a2 | ||
|
|
2eda484ef6 | ||
|
|
c7429af8a0 | ||
|
|
a753349cb0 | ||
|
|
55a68b28a2 | ||
|
|
fb0e2acb2f | ||
|
|
efaec6cdf8 | ||
|
|
3d41069dc4 | ||
|
|
dbebede7bf | ||
|
|
3e529f124f | ||
|
|
05a71c7d6a | ||
|
|
b9464865b6 | ||
|
|
1577430408 | ||
|
|
05d17a10ae | ||
|
|
2d0ea08524 | ||
|
|
c98cbbeac1 | ||
|
|
47c1640acc | ||
|
|
03666a1f37 | ||
|
|
9c92242ca0 | ||
|
|
a354071dd0 | ||
|
|
758680d4f8 | ||
|
|
1738fd0a96 | ||
|
|
87b7edfc72 | ||
|
|
def05700d5 | ||
|
|
b547681e08 | ||
|
|
0fd211537b | ||
|
|
a83bd4e81c | ||
|
|
ecdad5e6d5 | ||
|
|
d028929945 | ||
|
|
7b0e3db868 | ||
|
|
088eb72dd7 | ||
|
|
d550e3f626 | ||
|
|
8c6b41daf5 | ||
|
|
bbb050459b | ||
|
|
cab498c787 | ||
|
|
6359342ffb | ||
|
|
13285c2a5e | ||
|
|
33790d14a3 | ||
|
|
709b8cd371 | ||
|
|
1c9bbf1a92 | ||
|
|
16163fb850 | ||
|
|
73ccc2b08c | ||
|
|
c719be6474 | ||
|
|
718645e56c | ||
|
|
fbc8c36983 | ||
|
|
5519e42612 | ||
|
|
4157eaf4c5 | ||
|
|
60241127e2 | ||
|
|
f7d5322e8b | ||
|
|
41bb9c5280 | ||
|
|
69c0d61c5c | ||
|
|
63cb8ce975 | ||
|
|
907e4aa3c4 | ||
|
|
0a2a84b766 | ||
|
|
85b12ddd52 | ||
|
|
dd76f1eeee | ||
|
|
8963ac85f9 | ||
|
|
4a488b3e24 | ||
|
|
c4987b0b13 | ||
|
|
84b4821118 | ||
|
|
32ba9811f9 | ||
|
|
a0cd64c4d3 | ||
|
|
84687b743d | ||
|
|
b6f93dcec9 | ||
|
|
4f6c594973 | ||
|
|
a750c14735 | ||
|
|
9ce0dd4e55 | ||
|
|
0e1a336607 | ||
|
|
7fc2912d06 | ||
|
|
fdf231c237 | ||
|
|
1e08b5dccc | ||
|
|
030810ed3e | ||
|
|
62b74bdc2c | ||
|
|
8b7e9ed820 | ||
|
|
5dad89acd4 | ||
|
|
547b2d2827 | ||
|
|
93f29a0065 | ||
|
|
4f36494615 | ||
|
|
0a550f3e7d | ||
|
|
4bb9554e4a | ||
|
|
008616cfe6 | ||
|
|
e61ec94fbc | ||
|
|
e5152551ad | ||
|
|
b0822a5499 | ||
|
|
1fb6ab59e8 | ||
|
|
e16439400d | ||
|
|
e401f66698 | ||
|
|
2fa461b668 | ||
|
|
03d90bc0b3 | ||
|
|
268bc890ea | ||
|
|
8a6ee79f6f | ||
|
|
9052c32b46 | ||
|
|
995e729ebe | ||
|
|
76077e1ddf | ||
|
|
0467d88f06 | ||
|
|
f5eec194e7 | ||
|
|
7e00be391d | ||
|
|
d56599df2a | ||
|
|
9d9aab3680 | ||
|
|
a202b1b5cc | ||
|
|
90f731f3b1 | ||
|
|
7736b748d3 | ||
|
|
9c23333cb3 | ||
|
|
66a99009ba | ||
|
|
5d4c57491f | ||
|
|
73935ea3a2 | ||
|
|
32e595d4dd | ||
|
|
b0d69acb07 | ||
|
|
98355a419a | ||
|
|
cfb03d6cf0 | ||
|
|
d81ef3f962 | ||
|
|
5d62c67e75 | ||
|
|
53d53d5b1e | ||
|
|
29fe6ea47a | ||
|
|
640327ccb3 | ||
|
|
7cf0f6b37e | ||
|
|
03c2c569be | ||
|
|
eff6d4538a | ||
|
|
5ef7782e9c | ||
|
|
73101db8c4 | ||
|
|
bccdfc6d39 | ||
|
|
99595813bb | ||
|
|
fe07b54758 | ||
|
|
a42d173e7b | ||
|
|
e07f689238 | ||
|
|
7831eddc88 | ||
|
|
943b1bc80c | ||
|
|
95a184e9b7 | ||
|
|
3fa17e9d17 | ||
|
|
55e0fd9789 | ||
|
|
2a88889f44 | ||
|
|
5bad8126dc | ||
|
|
27bc242085 | ||
|
|
192b49cc6d | ||
|
|
e1b60f3693 | ||
|
|
2804f5323b | ||
|
|
676adc6b32 | ||
|
|
96a4e8de66 | ||
|
|
01180666b0 | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -25,3 +25,4 @@ config-variables:
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
- SLACK_ON_CALL_QA_STAGING_STREAM
|
||||
- DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
|
||||
- SLACK_ON_CALL_STORAGE_STAGING_STREAM
|
||||
|
||||
25
.github/workflows/build_and_test.yml
vendored
25
.github/workflows/build_and_test.yml
vendored
@@ -346,25 +346,22 @@ jobs:
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
report-benchmarks-failures:
|
||||
report-benchmarks-results-to-slack:
|
||||
needs: [ benchmarks, create-test-report ]
|
||||
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result)
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: slackapi/slack-github-action@v1
|
||||
- uses: slackapi/slack-github-action@v2
|
||||
with:
|
||||
channel-id: C060CNA47S9 # on-call-staging-storage-stream
|
||||
slack-message: |
|
||||
Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
|
||||
<${{ needs.create-test-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}"
|
||||
text: |
|
||||
Benchmarks on main: *${{ needs.benchmarks.result }}*
|
||||
- <${{ needs.create-test-report.outputs.report-url }}|Allure report>
|
||||
- <${{ github.event.head_commit.url }}|${{ github.sha }}>
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
|
||||
|
||||
17
Cargo.lock
generated
17
Cargo.lock
generated
@@ -3981,9 +3981,11 @@ name = "pagectl"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
"camino",
|
||||
"clap",
|
||||
"humantime",
|
||||
"itertools 0.10.5",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
@@ -4005,6 +4007,7 @@ dependencies = [
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"bincode",
|
||||
"bit_field",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -5655,6 +5658,7 @@ dependencies = [
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"desim",
|
||||
"env_logger 0.10.2",
|
||||
"fail",
|
||||
"futures",
|
||||
"hex",
|
||||
@@ -5683,6 +5687,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
@@ -5707,10 +5712,13 @@ dependencies = [
|
||||
name = "safekeeper_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"const_format",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"utils",
|
||||
]
|
||||
@@ -7558,12 +7566,21 @@ dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"criterion",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"prost",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
|
||||
@@ -66,6 +66,7 @@ RUN cd postgres && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
@@ -871,7 +872,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.12.6 cargo-pgrx && \
|
||||
cargo install --locked --version 0.12.9 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
@@ -908,19 +909,19 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
|
||||
\
|
||||
cd exts/rag && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
|
||||
\
|
||||
cd ../rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
|
||||
\
|
||||
cd ../rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
@@ -945,7 +946,8 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.
|
||||
# against postgres forks that decided to change their ABI name (like us).
|
||||
# With that we can build extensions without forking them and using stock
|
||||
# pgx. As this feature is new few manual version bumps were required.
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
@@ -963,7 +965,8 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
||||
@@ -984,9 +987,8 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \
|
||||
echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
|
||||
# TODO update pgrx version in the pg_tiktoken repo and remove this line
|
||||
sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \
|
||||
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
|
||||
|
||||
@@ -1028,7 +1030,11 @@ ARG PG_VERSION
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
cargo pgrx install --release
|
||||
|
||||
#########################################################################################
|
||||
|
||||
@@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::Parser;
|
||||
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
|
||||
use nix::unistd::Pid;
|
||||
use tracing::{info, info_span, warn, Instrument};
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::fs_ext::is_directory_empty;
|
||||
|
||||
#[path = "fast_import/aws_s3_sync.rs"]
|
||||
@@ -41,12 +41,19 @@ mod child_stdio_to_log;
|
||||
#[path = "fast_import/s3_uri.rs"]
|
||||
mod s3_uri;
|
||||
|
||||
const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
|
||||
const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
struct Args {
|
||||
#[clap(long)]
|
||||
working_directory: Utf8PathBuf,
|
||||
#[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
|
||||
s3_prefix: s3_uri::S3Uri,
|
||||
s3_prefix: Option<s3_uri::S3Uri>,
|
||||
#[clap(long)]
|
||||
source_connection_string: Option<String>,
|
||||
#[clap(short, long)]
|
||||
interactive: bool,
|
||||
#[clap(long)]
|
||||
pg_bin_dir: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
@@ -77,30 +84,70 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
|
||||
info!("starting");
|
||||
|
||||
let Args {
|
||||
working_directory,
|
||||
s3_prefix,
|
||||
pg_bin_dir,
|
||||
pg_lib_dir,
|
||||
} = Args::parse();
|
||||
let args = Args::parse();
|
||||
|
||||
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
// Validate arguments
|
||||
if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
|
||||
anyhow::bail!("either s3_prefix or source_connection_string must be specified");
|
||||
}
|
||||
if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
|
||||
anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
|
||||
}
|
||||
|
||||
let spec: Spec = {
|
||||
let spec_key = s3_prefix.append("/spec.json");
|
||||
let s3_client = aws_sdk_s3::Client::new(&aws_config);
|
||||
let object = s3_client
|
||||
.get_object()
|
||||
.bucket(&spec_key.bucket)
|
||||
.key(spec_key.key)
|
||||
.send()
|
||||
.await
|
||||
.context("get spec from s3")?
|
||||
.body
|
||||
.collect()
|
||||
.await
|
||||
.context("download spec body")?;
|
||||
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
|
||||
let working_directory = args.working_directory;
|
||||
let pg_bin_dir = args.pg_bin_dir;
|
||||
let pg_lib_dir = args.pg_lib_dir;
|
||||
|
||||
// Initialize AWS clients only if s3_prefix is specified
|
||||
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
|
||||
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
let kms = aws_sdk_kms::Client::new(&config);
|
||||
(Some(config), Some(kms))
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
// Get source connection string either from S3 spec or direct argument
|
||||
let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
|
||||
let spec: Spec = {
|
||||
let spec_key = s3_prefix.append("/spec.json");
|
||||
let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
|
||||
let object = s3_client
|
||||
.get_object()
|
||||
.bucket(&spec_key.bucket)
|
||||
.key(spec_key.key)
|
||||
.send()
|
||||
.await
|
||||
.context("get spec from s3")?
|
||||
.body
|
||||
.collect()
|
||||
.await
|
||||
.context("download spec body")?;
|
||||
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
|
||||
};
|
||||
|
||||
match spec.encryption_secret {
|
||||
EncryptionSecret::KMS { key_id } => {
|
||||
let mut output = kms_client
|
||||
.unwrap()
|
||||
.decrypt()
|
||||
.key_id(key_id)
|
||||
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
|
||||
spec.source_connstring_ciphertext_base64,
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
.context("decrypt source connection string")?;
|
||||
let plaintext = output
|
||||
.plaintext
|
||||
.take()
|
||||
.context("get plaintext source connection string")?;
|
||||
String::from_utf8(plaintext.into_inner())
|
||||
.context("parse source connection string as utf8")?
|
||||
}
|
||||
}
|
||||
} else {
|
||||
args.source_connection_string.unwrap()
|
||||
};
|
||||
|
||||
match tokio::fs::create_dir(&working_directory).await {
|
||||
@@ -123,15 +170,6 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.await
|
||||
.context("create pgdata directory")?;
|
||||
|
||||
//
|
||||
// Setup clients
|
||||
//
|
||||
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
|
||||
let kms_client = aws_sdk_kms::Client::new(&aws_config);
|
||||
|
||||
//
|
||||
// Initialize pgdata
|
||||
//
|
||||
let pgbin = pg_bin_dir.join("postgres");
|
||||
let pg_version = match get_pg_version(pgbin.as_ref()) {
|
||||
PostgresMajorVersion::V14 => 14,
|
||||
@@ -170,7 +208,13 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.args(["-c", &format!("max_parallel_workers={nproc}")])
|
||||
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
|
||||
.args(["-c", &format!("max_worker_processes={nproc}")])
|
||||
.args(["-c", "effective_io_concurrency=100"])
|
||||
.args([
|
||||
"-c",
|
||||
&format!(
|
||||
"effective_io_concurrency={}",
|
||||
if cfg!(target_os = "macos") { 0 } else { 100 }
|
||||
),
|
||||
])
|
||||
.env_clear()
|
||||
.stdout(std::process::Stdio::piped())
|
||||
.stderr(std::process::Stdio::piped())
|
||||
@@ -185,44 +229,58 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.instrument(info_span!("postgres")),
|
||||
);
|
||||
|
||||
// Create neondb database in the running postgres
|
||||
let restore_pg_connstring =
|
||||
format!("host=localhost port=5432 user={superuser} dbname=postgres");
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await;
|
||||
if res.is_ok() {
|
||||
info!("postgres is ready, could connect to it");
|
||||
break;
|
||||
if start_time.elapsed() > PG_WAIT_TIMEOUT {
|
||||
error!(
|
||||
"timeout exceeded: failed to poll postgres and create database within 10 minutes"
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
|
||||
Ok((client, connection)) => {
|
||||
// Spawn the connection handling task to maintain the connection
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
warn!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
match client.simple_query("CREATE DATABASE neondb;").await {
|
||||
Ok(_) => {
|
||||
info!("created neondb database");
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"failed to create database: {}, retying in {}s",
|
||||
e,
|
||||
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
|
||||
);
|
||||
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
info!(
|
||||
"postgres not ready yet, retrying in {}s",
|
||||
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
|
||||
);
|
||||
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Decrypt connection string
|
||||
//
|
||||
let source_connection_string = {
|
||||
match spec.encryption_secret {
|
||||
EncryptionSecret::KMS { key_id } => {
|
||||
let mut output = kms_client
|
||||
.decrypt()
|
||||
.key_id(key_id)
|
||||
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
|
||||
spec.source_connstring_ciphertext_base64,
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
.context("decrypt source connection string")?;
|
||||
let plaintext = output
|
||||
.plaintext
|
||||
.take()
|
||||
.context("get plaintext source connection string")?;
|
||||
String::from_utf8(plaintext.into_inner())
|
||||
.context("parse source connection string as utf8")?
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// Start the work
|
||||
//
|
||||
let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
|
||||
|
||||
let dumpdir = working_directory.join("dumpdir");
|
||||
|
||||
@@ -310,6 +368,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// If interactive mode, wait for Ctrl+C
|
||||
if args.interactive {
|
||||
info!("Running in interactive mode. Press Ctrl+C to shut down.");
|
||||
tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
|
||||
}
|
||||
|
||||
info!("shutdown postgres");
|
||||
{
|
||||
nix::sys::signal::kill(
|
||||
@@ -325,21 +389,24 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.context("wait for postgres to shut down")?;
|
||||
}
|
||||
|
||||
info!("upload pgdata");
|
||||
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
|
||||
.await
|
||||
.context("sync dump directory to destination")?;
|
||||
|
||||
info!("write status");
|
||||
{
|
||||
let status_dir = working_directory.join("status");
|
||||
std::fs::create_dir(&status_dir).context("create status directory")?;
|
||||
let status_file = status_dir.join("pgdata");
|
||||
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
|
||||
.context("write status file")?;
|
||||
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
|
||||
// Only sync if s3_prefix was specified
|
||||
if let Some(s3_prefix) = args.s3_prefix {
|
||||
info!("upload pgdata");
|
||||
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
|
||||
.await
|
||||
.context("sync status directory to destination")?;
|
||||
.context("sync dump directory to destination")?;
|
||||
|
||||
info!("write status");
|
||||
{
|
||||
let status_dir = working_directory.join("status");
|
||||
std::fs::create_dir(&status_dir).context("create status directory")?;
|
||||
let status_file = status_dir.join("pgdata");
|
||||
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
|
||||
.context("write status file")?;
|
||||
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
|
||||
.await
|
||||
.context("sync status directory to destination")?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -9,8 +9,9 @@ use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
|
||||
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
|
||||
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
|
||||
ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
|
||||
TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
@@ -231,6 +232,13 @@ enum Command {
|
||||
},
|
||||
/// List safekeepers known to the storage controller
|
||||
Safekeepers {},
|
||||
/// Set the scheduling policy of the specified safekeeper
|
||||
SafekeeperScheduling {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
scheduling_policy: SkSchedulingPolicyArg,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -283,6 +291,24 @@ impl FromStr for PlacementPolicyArg {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct SkSchedulingPolicyArg(SkSchedulingPolicy);
|
||||
|
||||
impl FromStr for SkSchedulingPolicyArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self(SkSchedulingPolicy::Active)),
|
||||
"disabled" => Ok(Self(SkSchedulingPolicy::Disabled)),
|
||||
"decomissioned" => Ok(Self(SkSchedulingPolicy::Decomissioned)),
|
||||
_ => Err(anyhow::anyhow!(
|
||||
"Unknown scheduling policy '{s}', try active,disabled,decomissioned"
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
|
||||
|
||||
@@ -477,16 +503,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("{table}");
|
||||
}
|
||||
Command::Tenants { node_id: None } => {
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/tenant".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
|
||||
|
||||
// Set up output formatting
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"TenantId",
|
||||
@@ -496,20 +513,55 @@ async fn main() -> anyhow::Result<()> {
|
||||
"Placement",
|
||||
"Scheduling",
|
||||
]);
|
||||
for tenant in resp {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
shard_zero
|
||||
.preferred_az_id
|
||||
.as_ref()
|
||||
.cloned()
|
||||
.unwrap_or("".to_string()),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
format!("{:?}", shard_zero.scheduling_policy),
|
||||
]);
|
||||
|
||||
// Pagination loop over listing API
|
||||
let mut start_after = None;
|
||||
const LIMIT: usize = 1000;
|
||||
loop {
|
||||
let path = match start_after {
|
||||
None => format!("control/v1/tenant?limit={LIMIT}"),
|
||||
Some(start_after) => {
|
||||
format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}")
|
||||
}
|
||||
};
|
||||
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(Method::GET, path, None)
|
||||
.await?;
|
||||
|
||||
if resp.is_empty() {
|
||||
// End of data reached
|
||||
break;
|
||||
}
|
||||
|
||||
// Give some visual feedback while we're building up the table (comfy_table doesn't have
|
||||
// streaming output)
|
||||
if resp.len() >= LIMIT {
|
||||
eprint!(".");
|
||||
}
|
||||
|
||||
start_after = Some(resp.last().unwrap().tenant_id);
|
||||
|
||||
for tenant in resp {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
shard_zero
|
||||
.preferred_az_id
|
||||
.as_ref()
|
||||
.cloned()
|
||||
.unwrap_or("".to_string()),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
format!("{:?}", shard_zero.scheduling_policy),
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
// Terminate progress dots
|
||||
if table.row_count() > LIMIT {
|
||||
eprint!("");
|
||||
}
|
||||
|
||||
println!("{table}");
|
||||
@@ -1176,6 +1228,23 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::SafekeeperScheduling {
|
||||
node_id,
|
||||
scheduling_policy,
|
||||
} => {
|
||||
let scheduling_policy = scheduling_policy.0;
|
||||
storcon_client
|
||||
.dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
|
||||
Method::POST,
|
||||
format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
|
||||
Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
|
||||
)
|
||||
.await?;
|
||||
println!(
|
||||
"Scheduling policy of {node_id} set to {}",
|
||||
String::from(scheduling_policy)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -81,7 +81,7 @@ configuration generation in them is less than its current one. Namely, it
|
||||
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
|
||||
response it sends its current configuration generation to let walproposer know.
|
||||
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
||||
accepting `Configuration`. Safekeeper switches to the given conf it is higher than its
|
||||
current one and ignores it otherwise. In any case it replies with
|
||||
```
|
||||
@@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not
|
||||
define consensus members. Instead, on start walproposer tracks highest
|
||||
configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
|
||||
from majority of `sk_set` and majority of `new_sk_set` (if it is present), it
|
||||
establishes this configuration as its own and moves to voting.
|
||||
establishes this configuration as its own and moves to voting.
|
||||
|
||||
It should stop talking to safekeepers not listed in the configuration at this
|
||||
point, though it is not unsafe to continue doing so.
|
||||
@@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts.
|
||||
The following algorithm can be executed anywhere having access to configuration
|
||||
storage and safekeepers. It is safe to interrupt / restart it and run multiple
|
||||
instances of it concurrently, though likely one of them won't make
|
||||
progress then. It accepts `desired_set: Vec<NodeId>` as input.
|
||||
progress then. It accepts `desired_set: Vec<NodeId>` as input.
|
||||
|
||||
Algorithm will refuse to make the change if it encounters previous interrupted
|
||||
change attempt, but in this case it will try to finish it.
|
||||
@@ -140,7 +140,7 @@ storage are reachable.
|
||||
safe. Failed CAS aborts the procedure.
|
||||
4) Call `PUT` `configuration` on safekeepers from the current set,
|
||||
delivering them `joint_conf`. Collecting responses from majority is required
|
||||
to proceed. If any response returned generation higher than
|
||||
to proceed. If any response returned generation higher than
|
||||
`joint_conf.generation`, abort (another switch raced us). Otherwise, choose
|
||||
max `<last_log_term, flush_lsn>` among responses and establish it as
|
||||
(in memory) `sync_position`. Also choose max `term` and establish it as (in
|
||||
@@ -149,49 +149,49 @@ storage are reachable.
|
||||
without ack from the new set. Similarly, we'll bump term on new majority
|
||||
to `sync_term` so that two computes with the same term are never elected.
|
||||
4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
|
||||
doesn't exist yet by doing `pull_timeline` from the majority of the
|
||||
doesn't exist yet by doing `pull_timeline` from the majority of the
|
||||
current set. Doing that on majority of `new_sk_set` is enough to
|
||||
proceed, but it is reasonable to ensure that all `new_sk_set` members
|
||||
are initialized -- if some of them are down why are we migrating there?
|
||||
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
|
||||
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
|
||||
Success on majority is enough.
|
||||
6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
|
||||
delivering them `joint_conf` and collecting their positions. This will
|
||||
switch them to the `joint_conf` which generally won't be needed
|
||||
switch them to the `joint_conf` which generally won't be needed
|
||||
because `pull_timeline` already includes it and plus additionally would be
|
||||
broadcast by compute. More importantly, we may proceed to the next step
|
||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
||||
`sync_position`. Similarly, on the happy path no waiting is not needed because
|
||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
||||
`sync_position`. Similarly, on the happy path no waiting is not needed because
|
||||
`pull_timeline` already includes it. However, we should double
|
||||
check to be safe. For example, timeline could have been created earlier e.g.
|
||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
||||
7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new
|
||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
||||
7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new
|
||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
||||
storage under one more CAS.
|
||||
8) Call `PUT` `configuration` on safekeepers from the new set,
|
||||
delivering them `new_conf`. It is enough to deliver it to the majority
|
||||
delivering them `new_conf`. It is enough to deliver it to the majority
|
||||
of the new set; the rest can be updated by compute.
|
||||
|
||||
I haven't put huge effort to make the description above very precise, because it
|
||||
is natural language prone to interpretations anyway. Instead I'd like to make TLA+
|
||||
spec of it.
|
||||
|
||||
Description above focuses on safety. To make the flow practical and live, here a few more
|
||||
Description above focuses on safety. To make the flow practical and live, here a few more
|
||||
considerations.
|
||||
1) It makes sense to ping new set to ensure it we are migrating to live node(s) before
|
||||
1) It makes sense to ping new set to ensure it we are migrating to live node(s) before
|
||||
step 3.
|
||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
||||
it is safe to rollback to the old conf with one more CAS.
|
||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
||||
the simplest is the procedure restart. There are more complicated scenarious like mentioned
|
||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
||||
has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
|
||||
the step 5 is never reached until compute is (re)awaken up to synchronize new member(s).
|
||||
I don't think we'll observe this in practice, but can add waking up compute if needed.
|
||||
4) In the end timeline should be locally deleted on the safekeeper(s) which are
|
||||
in the old set but not in the new one, unless they are unreachable. To be
|
||||
safe this also should be done under generation number (deletion proceeds only if
|
||||
safe this also should be done under generation number (deletion proceeds only if
|
||||
current configuration is <= than one in request and safekeeper is not memeber of it).
|
||||
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
|
||||
jump to step 7, using it as `new_conf`.
|
||||
@@ -202,47 +202,87 @@ The procedure ought to be driven from somewhere. Obvious candidates are control
|
||||
plane and storage_controller; and as each of them already has db we don't want
|
||||
yet another storage. I propose to manage safekeepers in storage_controller
|
||||
because 1) since it is in rust it simplifies simulation testing (more on this
|
||||
below) 2) it already manages pageservers.
|
||||
below) 2) it already manages pageservers.
|
||||
|
||||
This assumes that migration will be fully usable only after we migrate all
|
||||
tenants/timelines to storage_controller. It is discussible whether we want also
|
||||
to manage pageserver attachments for all of these, but likely we do.
|
||||
|
||||
This requires us to define storcon <-> cplane interface.
|
||||
This requires us to define storcon <-> cplane interface and changes.
|
||||
|
||||
### storage_controller <-> control plane interface
|
||||
### storage_controller <-> control plane interface and changes
|
||||
|
||||
First of all, control plane should
|
||||
[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
|
||||
storing safekeepers per timeline instead of per tenant because we can't migrate
|
||||
tenants atomically.
|
||||
tenants atomically.
|
||||
|
||||
The important question is how updated configuration is delivered from
|
||||
storage_controller to control plane to provide it to computes. As always, there
|
||||
are two options, pull and push. Let's do it the same push as with pageserver
|
||||
`/notify-attach` because 1) it keeps storage_controller out of critical compute
|
||||
start path 2) provides easier upgrade: there won't be such a thing as 'timeline
|
||||
managed by control plane / storcon', cplane just takes the value out of its db
|
||||
when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
|
||||
control plane until it succeeds.
|
||||
start path 2) uniformity. It makes storage_controller responsible for retrying
|
||||
notifying control plane until it succeeds.
|
||||
|
||||
So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
|
||||
updates it in the db if the provided conf generation is higher (the cplane db
|
||||
should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
|
||||
should update db which makes the call successful, and then try to schedule
|
||||
`apply_config` if possible, it is ok if not. storage_controller
|
||||
should rate limit calling the endpoint, but likely this won't be needed, as migration
|
||||
It is not needed for the control plane to fully know the `Configuration`. It is
|
||||
enough for it to only to be aware of the list of safekeepers in the latest
|
||||
configuration to supply it to compute, plus associated generation number to
|
||||
protect from stale update requests and to also pass it to compute.
|
||||
|
||||
So, cplane `/notify-safekeepers` for the timeline can accept JSON like
|
||||
```
|
||||
{
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
generation: u32,
|
||||
safekeepers: Vec<SafekeeperId>,
|
||||
}
|
||||
```
|
||||
where `SafekeeperId` is
|
||||
```
|
||||
{
|
||||
node_id: u64,
|
||||
host: String
|
||||
}
|
||||
```
|
||||
In principle `host` is redundant, but may be useful for observability.
|
||||
|
||||
The request updates list of safekeepers in the db if the provided conf
|
||||
generation is higher (the cplane db should also store generations for this).
|
||||
Similarly to
|
||||
[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365),
|
||||
it should update db which makes the call successful, and then try to schedule
|
||||
`apply_config` if possible, it is ok if not. storage_controller should rate
|
||||
limit calling the endpoint, but likely this won't be needed, as migration
|
||||
throughput is limited by `pull_timeline`.
|
||||
|
||||
Timeline (branch) creation in cplane should call storage_controller POST
|
||||
`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
|
||||
Response should be augmented with `safekeeper_conf: Configuration`. The call
|
||||
should be retried until succeeds.
|
||||
Response should be augmented with `safekeepers_generation` and `safekeepers`
|
||||
fields like described in `/notify-safekeepers` above. Initially (currently)
|
||||
these fields may be absent; in this case cplane chooses safekeepers on its own
|
||||
like it currently does. The call should be retried until succeeds.
|
||||
|
||||
Timeline deletion and tenant deletion in cplane should call appropriate
|
||||
storage_controller endpoints like it currently does for sharded tenants. The
|
||||
calls should be retried until they succeed.
|
||||
|
||||
When compute receives safekeepers list from control plane it needs to know the
|
||||
generation to checked whether it should be updated (note that compute may get
|
||||
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
|
||||
GUC is just a comma separates list of `host:port`. Let's prefix it with
|
||||
`g#<generation>:` to this end, so it will look like
|
||||
```
|
||||
g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401
|
||||
```
|
||||
|
||||
To summarize, list of cplane changes:
|
||||
- per tenant -> per timeline safekeepers management and addition of int `safekeeper_generation` field.
|
||||
- `/notify-safekeepers` endpoint.
|
||||
- Branch creation call may return list of safekeepers and when it is
|
||||
present cplane should adopt it instead of choosing on its own like it does currently.
|
||||
- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`.
|
||||
|
||||
### storage_controller implementation
|
||||
|
||||
Current 'load everything on startup and keep in memory' easy design is fine.
|
||||
@@ -360,10 +400,10 @@ source safekeeper might fail, which is not a problem if we are going to
|
||||
decomission the node but leaves garbage otherwise. I'd propose in the first version
|
||||
1) Don't attempt deletion at all if node status is `offline`.
|
||||
2) If it failed, just issue warning.
|
||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
||||
safekeeper 2) compare each one against configuration storage: if timeline
|
||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
||||
safekeeper 2) compare each one against configuration storage: if timeline
|
||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
||||
be deleted under generation number if node is not member of current generation.
|
||||
|
||||
Automating this is untrivial; we'd need to register all potential missing
|
||||
@@ -412,8 +452,8 @@ There should be following layers of tests:
|
||||
3) Since simulation testing injects at relatively high level points (not
|
||||
syscalls), it omits some code, in particular `pull_timeline`. Thus it is
|
||||
better to have basic tests covering whole system as well. Extended version of
|
||||
`test_restarts_under_load` would do: start background load and do migration
|
||||
under it, then restart endpoint and check that no reported commits
|
||||
`test_restarts_under_load` would do: start background load and do migration
|
||||
under it, then restart endpoint and check that no reported commits
|
||||
had been lost. I'd also add one more creating classic network split scenario, with
|
||||
one compute talking to AC and another to BD while migration from nodes ABC to ABD
|
||||
happens.
|
||||
@@ -422,35 +462,51 @@ There should be following layers of tests:
|
||||
|
||||
## Order of implementation and rollout
|
||||
|
||||
Note that
|
||||
Note that
|
||||
- Control plane parts and integration with it is fully independent from everything else
|
||||
(tests would use simulation and neon_local).
|
||||
- It is reasonable to make compute <-> safekeepers protocol change
|
||||
independent of enabling generations.
|
||||
- There is a lot of infra work making storage_controller aware of timelines and safekeepers
|
||||
and its impl/rollout should be separate from migration itself.
|
||||
- Initially walproposer can just stop working while it observers joint configuration.
|
||||
- Initially walproposer can just stop working while it observes joint configuration.
|
||||
Such window would be typically very short anyway.
|
||||
- Obviously we want to test the whole thing thoroughly on staging and only then
|
||||
gradually enable in prod.
|
||||
|
||||
To rollout smoothly, both walproposer and safekeeper should have flag
|
||||
`configurations_enabled`; when set to false, they would work as currently, i.e.
|
||||
walproposer is able to commit on whatever safekeeper set it is provided. Until
|
||||
all timelines are managed by storcon we'd need to use current script to migrate
|
||||
and update/drop entries in the storage_controller database if it has any.
|
||||
Let's have the following implementation bits for gradual rollout:
|
||||
- compute gets `neon.safekeepers_proto_version` flag.
|
||||
Initially both compute and safekeepers will be able to talk both
|
||||
versions so that we can delay force restart of them and for
|
||||
simplicity of rollback in case it is needed.
|
||||
- storcon gets `-set-safekeepers` config option disabled by
|
||||
default. Timeline creation request chooses safekeepers
|
||||
(and returns them in response to cplane) only when it is set to
|
||||
true.
|
||||
- control_plane [see above](storage_controller-<->-control-plane interface-and-changes)
|
||||
prefixes `neon.safekeepers` GUC with generation number. When it is 0
|
||||
(or prefix not present at all), walproposer behaves as currently, committing on
|
||||
the provided safekeeper list -- generations are disabled.
|
||||
If it is non 0 it follows this RFC rules.
|
||||
- We provide a script for manual migration to storage controller.
|
||||
It selects timeline(s) from control plane (specified or all of them) db
|
||||
and calls special import endpoint on storage controller which is very
|
||||
similar to timeline creation: it inserts into the db, sets
|
||||
configuration to initial on the safekeepers, calls cplane
|
||||
`notify-safekeepers`.
|
||||
|
||||
Safekeepers would need to be able to talk both current and new protocol version
|
||||
with compute to reduce number of computes restarted in prod once v2 protocol is
|
||||
deployed (though before completely switching we'd need to force this).
|
||||
|
||||
Let's have the following rollout order:
|
||||
- storage_controller becomes aware of safekeepers;
|
||||
- storage_controller gets timeline creation for new timelines and deletion requests, but
|
||||
doesn't manage all timelines yet. Migration can be tested on these new timelines.
|
||||
To keep control plane and storage_controller databases in sync while control
|
||||
plane still chooses the safekeepers initially (until all timelines are imported
|
||||
it can choose better), `TimelineCreateRequest` can get optional safekeepers
|
||||
field with safekeepers chosen by cplane.
|
||||
- Then we can import all existing timelines from control plane to
|
||||
storage_controller and gradually enable configurations region by region.
|
||||
Then the rollout for a region would be:
|
||||
- Current situation: safekeepers are choosen by control_plane.
|
||||
- We manually migrate some timelines, test moving them around.
|
||||
- Then we enable `--set-safekeepers` so that all new timelines
|
||||
are on storage controller.
|
||||
- Finally migrate all existing timelines using the script (no
|
||||
compute should be speaking old proto version at this point).
|
||||
|
||||
Until all timelines are managed by storcon we'd need to use current ad hoc
|
||||
script to migrate if needed. To keep state clean, all storage controller managed
|
||||
timelines must be migrated before that, or controller db and configurations
|
||||
state of safekeepers dropped manually.
|
||||
|
||||
Very rough implementation order:
|
||||
- Add concept of configurations to safekeepers (including control file),
|
||||
@@ -458,10 +514,10 @@ Very rough implementation order:
|
||||
- Implement walproposer changes, including protocol.
|
||||
- Implement storconn part. Use it in neon_local (and pytest).
|
||||
- Make cplane store safekeepers per timeline instead of per tenant.
|
||||
- Implement cplane/storcon integration. Route branch creation/deletion
|
||||
- Implement cplane/storcon integration. Route branch creation/deletion
|
||||
through storcon. Then we can test migration of new branches.
|
||||
- Finally import existing branches. Then we can drop cplane
|
||||
safekeeper selection code. Gradually enable configurations at
|
||||
- Finally import existing branches. Then we can drop cplane
|
||||
safekeeper selection code. Gradually enable configurations at
|
||||
computes and safekeepers. Before that, all computes must talk only
|
||||
v3 protocol version.
|
||||
|
||||
|
||||
247
docs/rfcs/040-profiling.md
Normal file
247
docs/rfcs/040-profiling.md
Normal file
@@ -0,0 +1,247 @@
|
||||
# CPU and Memory Profiling
|
||||
|
||||
Created 2025-01-12 by Erik Grinaker.
|
||||
|
||||
See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4).
|
||||
|
||||
## Summary
|
||||
|
||||
This document proposes a standard cross-team pattern for CPU and memory profiling across
|
||||
applications and languages, using the [pprof](https://github.com/google/pprof) profile format.
|
||||
|
||||
It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via
|
||||
[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/).
|
||||
Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations.
|
||||
|
||||
## Motivation
|
||||
|
||||
CPU and memory profiles are crucial observability tools for understanding performance issues,
|
||||
resource exhaustion, and resource costs. They allow answering questions like:
|
||||
|
||||
* Why is this process using 100% CPU?
|
||||
* How do I make this go faster?
|
||||
* Why did this process run out of memory?
|
||||
* Why are we paying for all these CPU cores and memory chips?
|
||||
|
||||
Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its
|
||||
standard library, using the [pprof](https://github.com/google/pprof) profile format and associated
|
||||
tooling.
|
||||
|
||||
This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires
|
||||
installing and running additional tools like `perf` as root on production nodes, with analysis tools
|
||||
that can be hard to use and often don't give good results. This is not only annoying, but can also
|
||||
significantly affect the resolution time of production incidents.
|
||||
|
||||
This proposal will:
|
||||
|
||||
* Provide CPU and heap profiles in pprof format via HTTP API.
|
||||
* Record continuous profiles in Grafana for aggregate historical analysis.
|
||||
* Make it easy for anyone to see a flamegraph in less than one minute.
|
||||
* Be reasonably consistent across teams and services (Rust, Go, C).
|
||||
|
||||
## Non Goals (For Now)
|
||||
|
||||
* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/)
|
||||
like mutexes, locks, goroutines, etc.
|
||||
* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/).
|
||||
* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization).
|
||||
|
||||
## Using Profiles
|
||||
|
||||
Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services:
|
||||
|
||||
```
|
||||
$ curl localhost:9898/profile/cpu >profile.pb.gz
|
||||
```
|
||||
|
||||
pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which
|
||||
provides flamegraphs, call graphs, plain text listings, and more:
|
||||
|
||||
```
|
||||
$ pprof -http :6060 <profile>
|
||||
```
|
||||
|
||||
Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly:
|
||||
|
||||
```
|
||||
$ curl localhost:9898/profile/cpu?format=svg >profile.svg
|
||||
$ open profile.svg
|
||||
```
|
||||
|
||||
Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles
|
||||
(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)).
|
||||
|
||||
## API Requirements
|
||||
|
||||
* HTTP endpoints that return a profile in pprof format (with symbols).
|
||||
* CPU: records a profile over the request time interval (`seconds` query parameter).
|
||||
* Memory: returns the current in-use heap allocations.
|
||||
* Unauthenticated, as it should not expose user data or pose a denial-of-service risk.
|
||||
* Default sample frequency should not impact service (maximum 5% CPU overhead).
|
||||
* Linux-compatibility.
|
||||
|
||||
Nice to have:
|
||||
|
||||
* Return flamegraph SVG directly from the HTTP endpoint if requested.
|
||||
* Configurable sample frequency for CPU profiles.
|
||||
* Historical heap allocations, by count and bytes.
|
||||
* macOS-compatiblity.
|
||||
|
||||
## Rust Profiling
|
||||
|
||||
[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs)
|
||||
contains ready-to-use HTTP endpoints for CPU and memory profiling:
|
||||
[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
|
||||
|
||||
### CPU
|
||||
|
||||
CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via
|
||||
[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338).
|
||||
Expose it unauthenticated at `/profile/cpu`.
|
||||
|
||||
Parameters:
|
||||
|
||||
* `format`: profile output format (`pprof` or `svg`; default `pprof`).
|
||||
* `seconds`: duration to collect profile over, in seconds (default `5`).
|
||||
* `frequency`: how often to sample thread stacks, in Hz (default `99`).
|
||||
* `force`: if `true`, cancel a running profile and start a new one (default `false`).
|
||||
|
||||
Works on Linux and macOS.
|
||||
|
||||
### Memory
|
||||
|
||||
Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator),
|
||||
and enable profiling with samples every 2 MB allocated:
|
||||
|
||||
```rust
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
```
|
||||
|
||||
pprof profiles are generated by
|
||||
[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via
|
||||
[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
|
||||
Expose it unauthenticated at `/profile/heap`.
|
||||
|
||||
Parameters:
|
||||
|
||||
* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`).
|
||||
|
||||
Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26).
|
||||
|
||||
## Go Profiling
|
||||
|
||||
The Go standard library includes pprof profiling via HTTP API in
|
||||
[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at
|
||||
`/debug/pprof`.
|
||||
|
||||
Works on Linux and macOS.
|
||||
|
||||
### CPU
|
||||
|
||||
Via `/debug/pprof/profile`. Parameters:
|
||||
|
||||
* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`).
|
||||
* `seconds`: duration to collect profile over, in seconds (default `30`).
|
||||
|
||||
Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)),
|
||||
and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default
|
||||
is likely ok (estimated 1% overhead).
|
||||
|
||||
### Memory
|
||||
|
||||
Via `/debug/pprof/heap`. Parameters:
|
||||
|
||||
* `seconds`: take a delta profile over the given duration, in seconds (default `0`).
|
||||
* `gc`: if `1`, garbage collect before taking profile.
|
||||
|
||||
## C Profiling
|
||||
|
||||
[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling
|
||||
with pprof output.
|
||||
|
||||
However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value
|
||||
since we don't own the internals anyway.
|
||||
|
||||
Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient,
|
||||
so this is not a priority at the moment.
|
||||
|
||||
## Grafana Continuous Profiling
|
||||
|
||||
[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles
|
||||
across the fleet, and archives them as time series. This can be used to analyze resource usage over
|
||||
time, either in aggregate or zoomed in to specific events and nodes.
|
||||
|
||||
Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals
|
||||
is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB).
|
||||
|
||||
It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)
|
||||
for Pageserver and Safekeeper.
|
||||
|
||||
### Scraping
|
||||
|
||||
* CPU profiling: 59 seconds at 19 Hz every 60 seconds.
|
||||
* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds.
|
||||
|
||||
There are two main approaches that can be taken for CPU profiles:
|
||||
|
||||
* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds).
|
||||
* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds).
|
||||
|
||||
We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead
|
||||
of a spiky high overhead. It likely also gives a more representative view of resource usage.
|
||||
However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the
|
||||
actual runtime of small functions. Note that Go does not support a frequency parameter, so we must
|
||||
use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz).
|
||||
|
||||
Only one CPU profile can be taken at a time. With continuous profiling, one will always be running.
|
||||
To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to
|
||||
cancel a running profile and start a new one.
|
||||
|
||||
### Overhead
|
||||
|
||||
With Rust:
|
||||
|
||||
* CPU profiles at 19 Hz frequency: 0.1% overhead.
|
||||
* Heap profiles at 2 MB frequency: 3% allocation overhead.
|
||||
* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver).
|
||||
* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver).
|
||||
|
||||
Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was
|
||||
11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw
|
||||
frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible
|
||||
overhead).
|
||||
|
||||
CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal
|
||||
after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one
|
||||
of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack
|
||||
trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but
|
||||
likely 0.1% in practice (given e.g. context switches).
|
||||
|
||||
Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the
|
||||
allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs,
|
||||
so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is
|
||||
consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the
|
||||
fact that performance-sensitive code will avoid allocations as far as possible.
|
||||
|
||||
Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for
|
||||
Pageserver.
|
||||
|
||||
## Alternatives Considered
|
||||
|
||||
* eBPF profiles.
|
||||
* Don't require instrumenting the binary.
|
||||
* Use less resources.
|
||||
* Can profile in kernel space too.
|
||||
* Supported by Grafana.
|
||||
* Less information about stack frames and spans.
|
||||
* Limited tooling for local analysis.
|
||||
* Does not support heap profiles.
|
||||
* Does not work on macOS.
|
||||
|
||||
* [Polar Signals](https://www.polarsignals.com) instead of Grafana.
|
||||
* We already use Grafana for everything else. Appears good enough.
|
||||
@@ -416,8 +416,6 @@ pub struct MetadataHealthListOutdatedResponse {
|
||||
}
|
||||
|
||||
/// Publicly exposed safekeeper description
|
||||
///
|
||||
/// The `active` flag which we have in the DB is not included on purpose: it is deprecated.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SafekeeperDescribeResponse {
|
||||
pub id: NodeId,
|
||||
@@ -433,6 +431,11 @@ pub struct SafekeeperDescribeResponse {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SafekeeperSchedulingPolicyRequest {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
@@ -24,7 +24,9 @@ pub struct Key {
|
||||
|
||||
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
|
||||
/// a struct of fields.
|
||||
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
|
||||
#[derive(
|
||||
Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug,
|
||||
)]
|
||||
pub struct CompactKey(i128);
|
||||
|
||||
/// The storage key size.
|
||||
|
||||
@@ -29,7 +29,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use crate::{
|
||||
key::Key,
|
||||
key::{CompactKey, Key},
|
||||
reltag::RelTag,
|
||||
shard::{ShardCount, ShardStripeSize, TenantShardId},
|
||||
};
|
||||
@@ -1400,8 +1400,6 @@ pub enum PagestreamFeMessage {
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -1413,8 +1411,6 @@ pub enum PagestreamBeMessage {
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
@@ -1426,9 +1422,6 @@ enum PagestreamBeMessageTag {
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
/// Test message discrimimant is unstable
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 106,
|
||||
}
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
@@ -1440,8 +1433,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
106 => Ok(PagestreamBeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
@@ -1559,20 +1550,6 @@ pub struct PagestreamDbSizeResponse {
|
||||
pub db_size: i64,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct PagestreamTestRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub batch_key: u64,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamTestResponse {
|
||||
pub req: PagestreamTestRequest,
|
||||
}
|
||||
|
||||
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
|
||||
// that require pageserver-internal types. It is sufficient to get the total size.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -1641,17 +1618,6 @@ impl PagestreamFeMessage {
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(req) => {
|
||||
bytes.put_u8(5);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.batch_key);
|
||||
let message = req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
@@ -1739,21 +1705,6 @@ impl PagestreamFeMessage {
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
5 => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key: body.read_u64::<BigEndian>()?,
|
||||
message: {
|
||||
let len = body.read_u64::<BigEndian>()?;
|
||||
let mut buf = vec![0; len as usize];
|
||||
body.read_exact(&mut buf)?;
|
||||
String::from_utf8(buf)?
|
||||
},
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
}
|
||||
}
|
||||
@@ -1797,15 +1748,6 @@ impl PagestreamBeMessage {
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
PagestreamProtocolVersion::V3 => {
|
||||
@@ -1874,18 +1816,6 @@ impl PagestreamBeMessage {
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2028,28 +1958,6 @@ impl PagestreamBeMessage {
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Tag::Test => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let batch_key = buf.read_u64::<BigEndian>()?;
|
||||
let len = buf.read_u64::<BigEndian>()?;
|
||||
let mut msg = vec![0; len as usize];
|
||||
buf.read_exact(&mut msg)?;
|
||||
let message = String::from_utf8(msg)?;
|
||||
Self::Test(PagestreamTestResponse {
|
||||
req: PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key,
|
||||
message,
|
||||
},
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
@@ -2069,8 +1977,23 @@ impl PagestreamBeMessage {
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(_) => "Test",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct PageTraceEvent {
|
||||
pub key: CompactKey,
|
||||
pub effective_lsn: Lsn,
|
||||
pub time: SystemTime,
|
||||
}
|
||||
|
||||
impl Default for PageTraceEvent {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
key: Default::default(),
|
||||
effective_lsn: Default::default(),
|
||||
time: std::time::UNIX_EPOCH,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
//! and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use crate::{key::Key, models::ShardParameters};
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -48,6 +50,23 @@ pub struct ShardIdentity {
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
/// Hash implementation
|
||||
///
|
||||
/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons.
|
||||
impl Hash for ShardIdentity {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
let ShardIdentity {
|
||||
number,
|
||||
count,
|
||||
stripe_size: _,
|
||||
layout: _,
|
||||
} = self;
|
||||
|
||||
number.0.hash(state);
|
||||
count.0.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
/// Stripe size in number of pages
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardStripeSize(pub u32);
|
||||
@@ -59,7 +78,7 @@ impl Default for ShardStripeSize {
|
||||
}
|
||||
|
||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
|
||||
pub struct ShardLayout(u8);
|
||||
|
||||
const LAYOUT_V1: ShardLayout = ShardLayout(1);
|
||||
|
||||
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlMultiXactCreate {
|
||||
pub mid: MultiXactId,
|
||||
/* new MultiXact's ID */
|
||||
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlMultiXactTruncate {
|
||||
pub oldest_multi_db: Oid,
|
||||
/* to-be-truncated range of multixact offsets */
|
||||
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlRelmapUpdate {
|
||||
pub dbid: Oid, /* database ID, or 0 for shared map */
|
||||
pub tsid: Oid, /* database's tablespace, or pg_global */
|
||||
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlReploriginDrop {
|
||||
pub node_id: RepOriginId,
|
||||
}
|
||||
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlReploriginSet {
|
||||
pub remote_lsn: Lsn,
|
||||
pub node_id: RepOriginId,
|
||||
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlSmgrTruncate {
|
||||
pub blkno: BlockNumber,
|
||||
pub rnode: RelFileNode,
|
||||
@@ -984,7 +984,7 @@ impl XlDropDatabase {
|
||||
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
|
||||
/// struct for commits and aborts.
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlXactParsedRecord {
|
||||
pub xid: TransactionId,
|
||||
pub info: u8,
|
||||
|
||||
@@ -5,9 +5,12 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
const_format.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
tokio.workspace = true
|
||||
utils.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
|
||||
@@ -4,12 +4,15 @@ use const_format::formatcp;
|
||||
use pq_proto::SystemId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub mod membership;
|
||||
/// Public API types
|
||||
pub mod models;
|
||||
|
||||
/// Consensus logical timestamp. Note: it is a part of sk control file.
|
||||
pub type Term = u64;
|
||||
pub const INVALID_TERM: Term = 0;
|
||||
/// With this term timeline is created initially. It
|
||||
/// is a normal term except wp is never elected with it.
|
||||
pub const INITIAL_TERM: Term = 0;
|
||||
|
||||
/// Information about Postgres. Safekeeper gets it once and then verifies all
|
||||
/// further connections from computes match. Note: it is a part of sk control
|
||||
|
||||
166
libs/safekeeper_api/src/membership.rs
Normal file
166
libs/safekeeper_api/src/membership.rs
Normal file
@@ -0,0 +1,166 @@
|
||||
//! Types defining safekeeper membership, see
|
||||
//! rfcs/035-safekeeper-dynamic-membership-change.md
|
||||
//! for details.
|
||||
|
||||
use std::{collections::HashSet, fmt::Display};
|
||||
|
||||
use anyhow;
|
||||
use anyhow::bail;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
/// Number uniquely identifying safekeeper configuration.
|
||||
/// Note: it is a part of sk control file.
|
||||
pub type Generation = u32;
|
||||
/// 1 is the first valid generation, 0 is used as
|
||||
/// a placeholder before we fully migrate to generations.
|
||||
pub const INVALID_GENERATION: Generation = 0;
|
||||
pub const INITIAL_GENERATION: Generation = 1;
|
||||
|
||||
/// Membership is defined by ids so e.g. walproposer uses them to figure out
|
||||
/// quorums, but we also carry host and port to give wp idea where to connect.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct SafekeeperId {
|
||||
pub id: NodeId,
|
||||
pub host: String,
|
||||
/// We include here only port for computes -- that is, pg protocol tenant
|
||||
/// only port, or wide pg protocol port if the former is not configured.
|
||||
pub pg_port: u16,
|
||||
}
|
||||
|
||||
impl Display for SafekeeperId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port)
|
||||
}
|
||||
}
|
||||
|
||||
/// Set of safekeepers.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(transparent)]
|
||||
pub struct MemberSet {
|
||||
pub members: Vec<SafekeeperId>,
|
||||
}
|
||||
|
||||
impl MemberSet {
|
||||
pub fn empty() -> Self {
|
||||
MemberSet {
|
||||
members: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
|
||||
let hs: HashSet<NodeId> = HashSet::from_iter(members.iter().map(|sk| sk.id));
|
||||
if hs.len() != members.len() {
|
||||
bail!("duplicate safekeeper id in the set {:?}", members);
|
||||
}
|
||||
Ok(MemberSet { members })
|
||||
}
|
||||
|
||||
pub fn contains(&self, sk: &SafekeeperId) -> bool {
|
||||
self.members.iter().any(|m| m.id == sk.id)
|
||||
}
|
||||
|
||||
pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
|
||||
if self.contains(&sk) {
|
||||
bail!(format!(
|
||||
"sk {} is already member of the set {}",
|
||||
sk.id, self
|
||||
));
|
||||
}
|
||||
self.members.push(sk);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for MemberSet {
|
||||
/// Display as a comma separated list of members.
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let sks_str = self
|
||||
.members
|
||||
.iter()
|
||||
.map(|m| m.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
write!(f, "({})", sks_str.join(", "))
|
||||
}
|
||||
}
|
||||
|
||||
/// Safekeeper membership configuration.
|
||||
/// Note: it is a part of both control file and http API.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct Configuration {
|
||||
/// Unique id.
|
||||
pub generation: Generation,
|
||||
/// Current members of the configuration.
|
||||
pub members: MemberSet,
|
||||
/// Some means it is a joint conf.
|
||||
pub new_members: Option<MemberSet>,
|
||||
}
|
||||
|
||||
impl Configuration {
|
||||
/// Used for pre-generations timelines, will be removed eventually.
|
||||
pub fn empty() -> Self {
|
||||
Configuration {
|
||||
generation: INVALID_GENERATION,
|
||||
members: MemberSet::empty(),
|
||||
new_members: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Configuration {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"gen={}, members={}, new_members={}",
|
||||
self.generation,
|
||||
self.members,
|
||||
self.new_members
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or(String::from("none"))
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{MemberSet, SafekeeperId};
|
||||
use utils::id::NodeId;
|
||||
|
||||
#[test]
|
||||
fn test_member_set() {
|
||||
let mut members = MemberSet::empty();
|
||||
members
|
||||
.add(SafekeeperId {
|
||||
id: NodeId(42),
|
||||
host: String::from("lala.org"),
|
||||
pg_port: 5432,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
members
|
||||
.add(SafekeeperId {
|
||||
id: NodeId(42),
|
||||
host: String::from("lala.org"),
|
||||
pg_port: 5432,
|
||||
})
|
||||
.expect_err("duplicate must not be allowed");
|
||||
|
||||
members
|
||||
.add(SafekeeperId {
|
||||
id: NodeId(43),
|
||||
host: String::from("bubu.org"),
|
||||
pg_port: 5432,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
println!("members: {}", members);
|
||||
|
||||
let j = serde_json::to_string(&members).expect("failed to serialize");
|
||||
println!("members json: {}", j);
|
||||
assert_eq!(
|
||||
j,
|
||||
r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"#
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
//! Types used in safekeeper http API. Many of them are also reused internally.
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::TimestampTz;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::net::SocketAddr;
|
||||
@@ -11,7 +12,7 @@ use utils::{
|
||||
pageserver_feedback::PageserverFeedback,
|
||||
};
|
||||
|
||||
use crate::{ServerInfo, Term};
|
||||
use crate::{membership::Configuration, ServerInfo, Term};
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct SafekeeperStatus {
|
||||
@@ -22,13 +23,16 @@ pub struct SafekeeperStatus {
|
||||
pub struct TimelineCreateRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub peer_ids: Option<Vec<NodeId>>,
|
||||
pub mconf: Configuration,
|
||||
pub pg_version: u32,
|
||||
pub system_id: Option<u64>,
|
||||
// By default WAL_SEGMENT_SIZE
|
||||
pub wal_seg_size: Option<u32>,
|
||||
pub commit_lsn: Lsn,
|
||||
// If not passed, it is assigned to the beginning of commit_lsn segment.
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
pub start_lsn: Lsn,
|
||||
// Normal creation should omit this field (start_lsn initializes all LSNs).
|
||||
// However, we allow specifying custom value higher than start_lsn for
|
||||
// manual recovery case, see test_s3_wal_replay.
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
/// Same as TermLsn, but serializes LSN using display serializer
|
||||
@@ -143,7 +147,13 @@ pub type ConnectionId = u32;
|
||||
|
||||
/// Serialize is used only for json'ing in API response. Also used internally.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalSenderState {
|
||||
pub enum WalSenderState {
|
||||
Vanilla(VanillaWalSenderState),
|
||||
Interpreted(InterpretedWalSenderState),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VanillaWalSenderState {
|
||||
pub ttid: TenantTimelineId,
|
||||
pub addr: SocketAddr,
|
||||
pub conn_id: ConnectionId,
|
||||
@@ -152,6 +162,17 @@ pub struct WalSenderState {
|
||||
pub feedback: ReplicationFeedback,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct InterpretedWalSenderState {
|
||||
pub ttid: TenantTimelineId,
|
||||
pub shard: ShardIdentity,
|
||||
pub addr: SocketAddr,
|
||||
pub conn_id: ConnectionId,
|
||||
// postgres application_name
|
||||
pub appname: Option<String>,
|
||||
pub feedback: ReplicationFeedback,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalReceiverState {
|
||||
/// None means it is recovery initiated by us (this safekeeper).
|
||||
@@ -172,6 +193,7 @@ pub enum WalReceiverStatus {
|
||||
pub struct TimelineStatus {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub mconf: Configuration,
|
||||
pub acceptor_state: AcceptorStateStatus,
|
||||
pub pg_info: ServerInfo,
|
||||
pub flush_lsn: Lsn,
|
||||
@@ -186,6 +208,20 @@ pub struct TimelineStatus {
|
||||
pub walreceivers: Vec<WalReceiverState>,
|
||||
}
|
||||
|
||||
/// Request to switch membership configuration.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TimelineMembershipSwitchRequest {
|
||||
pub mconf: Configuration,
|
||||
}
|
||||
|
||||
/// In response both previous and current configuration are sent.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineMembershipSwitchResponse {
|
||||
pub previous_conf: Configuration,
|
||||
pub current_conf: Configuration,
|
||||
}
|
||||
|
||||
fn lsn_invalid() -> Lsn {
|
||||
Lsn::INVALID
|
||||
}
|
||||
|
||||
@@ -24,3 +24,18 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
remote_storage.workspace = true
|
||||
tokio-util.workspace = true
|
||||
serde_json.workspace = true
|
||||
futures.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
pprof.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "bench_interpret_wal"
|
||||
harness = false
|
||||
|
||||
34
libs/wal_decoder/benches/README.md
Normal file
34
libs/wal_decoder/benches/README.md
Normal file
@@ -0,0 +1,34 @@
|
||||
## WAL Decoding and Interpretation Benchmarks
|
||||
|
||||
Note that these benchmarks pull WAL from a public bucket in S3
|
||||
as a preparation step. Hence, you need a way to auth with AWS.
|
||||
You can achieve this by copying the `~/.aws/config` file from
|
||||
the AWS SSO notion page and exporting `AWS_PROFILE=dev` when invoking
|
||||
the benchmarks.
|
||||
|
||||
To run benchmarks:
|
||||
|
||||
```sh
|
||||
aws sso login --profile dev
|
||||
|
||||
# All benchmarks.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder
|
||||
|
||||
# Specific file.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal
|
||||
|
||||
# Specific benchmark.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded
|
||||
|
||||
# List available benchmarks.
|
||||
cargo bench --package wal_decoder --benches -- --list
|
||||
|
||||
# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
|
||||
# Output in target/criterion/*/profile/flamegraph.svg.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10
|
||||
```
|
||||
|
||||
Additional charts and statistics are available in `target/criterion/report/index.html`.
|
||||
|
||||
Benchmarks are automatically compared against the previous run. To compare against other runs, see
|
||||
`--baseline` and `--save-baseline`.
|
||||
250
libs/wal_decoder/benches/bench_interpret_wal.rs
Normal file
250
libs/wal_decoder/benches/bench_interpret_wal.rs
Normal file
@@ -0,0 +1,250 @@
|
||||
use anyhow::Context;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
|
||||
use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use serde::Deserialize;
|
||||
use std::{env, num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use remote_storage::{
|
||||
DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
use wal_decoder::models::InterpretedWalRecord;
|
||||
|
||||
const S3_BUCKET: &str = "neon-github-public-dev";
|
||||
const S3_REGION: &str = "eu-central-1";
|
||||
const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/";
|
||||
const METADATA_FILENAME: &str = "metadata.json";
|
||||
|
||||
/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
|
||||
/// This mirrors the configuration in bin/safekeeper.rs.
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
|
||||
|
||||
async fn create_s3_client() -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: S3_BUCKET.to_string(),
|
||||
bucket_region: S3_REGION.to_string(),
|
||||
prefix_in_bucket: Some(BUCKET_PREFIX.to_string()),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: None,
|
||||
upload_storage_class: None,
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
async fn download_bench_data(
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Utf8TempDir> {
|
||||
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?;
|
||||
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?;
|
||||
|
||||
eprintln!("Downloading benchmark data to {:?}", temp_dir);
|
||||
|
||||
let listing = client
|
||||
.list(None, ListingMode::NoDelimiter, None, cancel)
|
||||
.await?;
|
||||
|
||||
let mut downloads = listing
|
||||
.keys
|
||||
.into_iter()
|
||||
.map(|obj| {
|
||||
let client = client.clone();
|
||||
let temp_dir_path = temp_dir.path().to_owned();
|
||||
|
||||
async move {
|
||||
let remote_path = obj.key;
|
||||
let download = client
|
||||
.download(&remote_path, &DownloadOpts::default(), cancel)
|
||||
.await?;
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let file_name = remote_path.object_name().unwrap();
|
||||
let file_path = temp_dir_path.join(file_name);
|
||||
let file = tokio::fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.write(true)
|
||||
.open(&file_path)
|
||||
.await?;
|
||||
|
||||
let mut writer = tokio::io::BufWriter::new(file);
|
||||
tokio::io::copy_buf(&mut body, &mut writer).await?;
|
||||
|
||||
Ok::<(), anyhow::Error>(())
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
while let Some(download) = downloads.next().await {
|
||||
download?;
|
||||
}
|
||||
|
||||
Ok(temp_dir)
|
||||
}
|
||||
|
||||
struct BenchmarkData {
|
||||
wal: Vec<u8>,
|
||||
meta: BenchmarkMetadata,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct BenchmarkMetadata {
|
||||
pg_version: u32,
|
||||
start_lsn: Lsn,
|
||||
}
|
||||
|
||||
async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result<BenchmarkData> {
|
||||
eprintln!("Loading benchmark data from {:?}", path);
|
||||
|
||||
let mut entries = tokio::fs::read_dir(path).await?;
|
||||
let mut ordered_segment_paths = Vec::new();
|
||||
let mut metadata = None;
|
||||
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
if entry.file_name() == METADATA_FILENAME {
|
||||
let bytes = tokio::fs::read(entry.path()).await?;
|
||||
metadata = Some(
|
||||
serde_json::from_slice::<BenchmarkMetadata>(&bytes)
|
||||
.context("failed to deserialize metadata.json")?,
|
||||
);
|
||||
} else {
|
||||
ordered_segment_paths.push(entry.path());
|
||||
}
|
||||
}
|
||||
|
||||
ordered_segment_paths.sort();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for path in ordered_segment_paths {
|
||||
if buffer.len() >= input_size {
|
||||
break;
|
||||
}
|
||||
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
let file = tokio::fs::File::open(path).await?;
|
||||
let reader = tokio::io::BufReader::new(file);
|
||||
let decoder = ZstdDecoder::new(reader);
|
||||
let mut reader = tokio::io::BufReader::new(decoder);
|
||||
tokio::io::copy_buf(&mut reader, &mut buffer).await?;
|
||||
}
|
||||
|
||||
buffer.truncate(input_size);
|
||||
|
||||
Ok(BenchmarkData {
|
||||
wal: buffer,
|
||||
meta: metadata.unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
const INPUT_SIZE: usize = 128 * 1024 * 1024;
|
||||
|
||||
let setup_runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let (_temp_dir, bench_data) = setup_runtime.block_on(async move {
|
||||
let cancel = CancellationToken::new();
|
||||
let client = create_s3_client().await.unwrap();
|
||||
let temp_dir = download_bench_data(client, &cancel).await.unwrap();
|
||||
let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap();
|
||||
|
||||
(temp_dir, bench_data)
|
||||
});
|
||||
|
||||
eprintln!(
|
||||
"Benchmarking against {} MiB of WAL",
|
||||
INPUT_SIZE / 1024 / 1024
|
||||
);
|
||||
|
||||
let mut group = c.benchmark_group("decode-interpret-wal");
|
||||
group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64));
|
||||
group.sample_size(10);
|
||||
|
||||
group.bench_function("unsharded", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()]))
|
||||
});
|
||||
|
||||
let eight_shards = (0..8)
|
||||
.map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
group.bench_function("8/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &eight_shards))
|
||||
});
|
||||
|
||||
let four_shards = eight_shards
|
||||
.into_iter()
|
||||
.filter(|s| s.number.0 % 2 == 0)
|
||||
.collect::<Vec<_>>();
|
||||
group.bench_function("4/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &four_shards))
|
||||
});
|
||||
|
||||
let two_shards = four_shards
|
||||
.into_iter()
|
||||
.filter(|s| s.number.0 % 4 == 0)
|
||||
.collect::<Vec<_>>();
|
||||
group.bench_function("2/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &two_shards))
|
||||
});
|
||||
}
|
||||
|
||||
fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) {
|
||||
let r = decode_interpret(bench, shards);
|
||||
if let Err(e) = r {
|
||||
panic!("{e:?}");
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> {
|
||||
let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version);
|
||||
let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
|
||||
for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) {
|
||||
decoder.feed_bytes(chunk);
|
||||
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
|
||||
assert!(lsn.is_aligned());
|
||||
let _ = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
shard,
|
||||
lsn,
|
||||
bench.meta.pg_version,
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
criterion_group!(
|
||||
name=benches;
|
||||
config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets=criterion_benchmark
|
||||
);
|
||||
criterion_main!(benches);
|
||||
@@ -1,6 +1,8 @@
|
||||
//! This module contains logic for decoding and interpreting
|
||||
//! raw bytes which represent a raw Postgres WAL record.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::models::*;
|
||||
use crate::serialized_batch::SerializedValueBatch;
|
||||
use bytes::{Buf, Bytes};
|
||||
@@ -14,15 +16,15 @@ use utils::lsn::Lsn;
|
||||
|
||||
impl InterpretedWalRecord {
|
||||
/// Decode and interpreted raw bytes which represent one Postgres WAL record.
|
||||
/// Data blocks which do not match the provided shard identity are filtered out.
|
||||
/// Data blocks which do not match any of the provided shard identities are filtered out.
|
||||
/// Shard 0 is a special case since it tracks all relation sizes. We only give it
|
||||
/// the keys that are being written as that is enough for updating relation sizes.
|
||||
pub fn from_bytes_filtered(
|
||||
buf: Bytes,
|
||||
shard: &ShardIdentity,
|
||||
shards: &[ShardIdentity],
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<InterpretedWalRecord> {
|
||||
) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(buf, &mut decoded, pg_version)?;
|
||||
let xid = decoded.xl_xid;
|
||||
@@ -33,43 +35,57 @@ impl InterpretedWalRecord {
|
||||
FlushUncommittedRecords::No
|
||||
};
|
||||
|
||||
let metadata_record =
|
||||
MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
|
||||
let batch = SerializedValueBatch::from_decoded_filtered(
|
||||
let mut shard_records: HashMap<ShardIdentity, InterpretedWalRecord> =
|
||||
HashMap::with_capacity(shards.len());
|
||||
for shard in shards {
|
||||
shard_records.insert(
|
||||
*shard,
|
||||
InterpretedWalRecord {
|
||||
metadata_record: None,
|
||||
batch: SerializedValueBatch::default(),
|
||||
next_record_lsn,
|
||||
flush_uncommitted,
|
||||
xid,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
MetadataRecord::from_decoded_filtered(
|
||||
&decoded,
|
||||
&mut shard_records,
|
||||
next_record_lsn,
|
||||
pg_version,
|
||||
)?;
|
||||
SerializedValueBatch::from_decoded_filtered(
|
||||
decoded,
|
||||
shard,
|
||||
&mut shard_records,
|
||||
next_record_lsn,
|
||||
pg_version,
|
||||
)?;
|
||||
|
||||
Ok(InterpretedWalRecord {
|
||||
metadata_record,
|
||||
batch,
|
||||
next_record_lsn,
|
||||
flush_uncommitted,
|
||||
xid,
|
||||
})
|
||||
Ok(shard_records)
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataRecord {
|
||||
/// Builds a metadata record for this WAL record, if any.
|
||||
/// Populates the given `shard_records` with metadata records from this WAL record, if any,
|
||||
/// discarding those belonging to other shards.
|
||||
///
|
||||
/// Only metadata records relevant for the given shard are emitted. Currently, most metadata
|
||||
/// Only metadata records relevant for the given shards is emitted. Currently, most metadata
|
||||
/// records are broadcast to all shards for simplicity, but this should be improved.
|
||||
fn from_decoded_filtered(
|
||||
decoded: &DecodedWALRecord,
|
||||
shard: &ShardIdentity,
|
||||
shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: this doesn't actually copy the bytes since
|
||||
// the [`Bytes`] type implements it via a level of indirection.
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
// First, generate metadata records from the decoded WAL record.
|
||||
let mut metadata_record = match decoded.xl_rmid {
|
||||
let metadata_record = match decoded.xl_rmid {
|
||||
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
|
||||
Self::decode_heapam_record(&mut buf, decoded, pg_version)?
|
||||
}
|
||||
@@ -112,41 +128,65 @@ impl MetadataRecord {
|
||||
};
|
||||
|
||||
// Next, filter the metadata record by shard.
|
||||
match metadata_record {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
|
||||
) => {
|
||||
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
|
||||
// of the main relation. These are sharded and managed just like regular relation pages.
|
||||
// See: https://github.com/neondatabase/neon/issues/9855
|
||||
let is_local_vm_page = |heap_blk| {
|
||||
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
|
||||
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
|
||||
};
|
||||
// Send the old and new VM page updates to their respective shards.
|
||||
clear_vm_bits.old_heap_blkno = clear_vm_bits
|
||||
.old_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
clear_vm_bits.new_heap_blkno = clear_vm_bits
|
||||
.new_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
// If neither VM page belongs to this shard, discard the record.
|
||||
if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none()
|
||||
{
|
||||
metadata_record = None
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
match metadata_record {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)),
|
||||
) => {
|
||||
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
|
||||
// of the main relation. These are sharded and managed just like regular relation pages.
|
||||
// See: https://github.com/neondatabase/neon/issues/9855
|
||||
let is_local_vm_page = |heap_blk| {
|
||||
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
|
||||
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
|
||||
};
|
||||
// Send the old and new VM page updates to their respective shards.
|
||||
let updated_old_heap_blkno = clear_vm_bits
|
||||
.old_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
let updated_new_heap_blkno = clear_vm_bits
|
||||
.new_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
// If neither VM page belongs to this shard, discard the record.
|
||||
if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() {
|
||||
// Clone the record and update it for the current shard.
|
||||
let mut for_shard = metadata_record.clone();
|
||||
match for_shard {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(
|
||||
ref mut clear_vm_bits,
|
||||
))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(
|
||||
ref mut clear_vm_bits,
|
||||
)),
|
||||
) => {
|
||||
clear_vm_bits.old_heap_blkno = updated_old_heap_blkno;
|
||||
clear_vm_bits.new_heap_blkno = updated_new_heap_blkno;
|
||||
record.metadata_record = for_shard;
|
||||
}
|
||||
_ => {
|
||||
unreachable!("for_shard is a clone of what we checked above")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
|
||||
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
|
||||
if shard.is_shard_zero() {
|
||||
record.metadata_record = metadata_record;
|
||||
// No other shards should receive this record, so we stop traversing shards early.
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// All other metadata records are sent to all shards.
|
||||
record.metadata_record = metadata_record.clone();
|
||||
}
|
||||
}
|
||||
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
|
||||
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
|
||||
if !shard.is_shard_zero() {
|
||||
metadata_record = None;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(metadata_record)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn decode_heapam_record(
|
||||
|
||||
@@ -48,7 +48,7 @@ pub mod proto {
|
||||
tonic::include_proto!("interpreted_wal");
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Copy, Clone, Serialize, Deserialize)]
|
||||
pub enum FlushUncommittedRecords {
|
||||
Yes,
|
||||
No,
|
||||
@@ -64,7 +64,7 @@ pub struct InterpretedWalRecords {
|
||||
}
|
||||
|
||||
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct InterpretedWalRecord {
|
||||
/// Optional metadata record - may cause writes to metadata keys
|
||||
/// in the storage engine
|
||||
@@ -107,7 +107,7 @@ impl InterpretedWalRecord {
|
||||
|
||||
/// The interpreted part of the Postgres WAL record which requires metadata
|
||||
/// writes to the underlying storage engine.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum MetadataRecord {
|
||||
Heapam(HeapamRecord),
|
||||
Neonrmgr(NeonrmgrRecord),
|
||||
@@ -123,12 +123,12 @@ pub enum MetadataRecord {
|
||||
Replorigin(ReploriginRecord),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum HeapamRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct ClearVmBits {
|
||||
pub new_heap_blkno: Option<u32>,
|
||||
pub old_heap_blkno: Option<u32>,
|
||||
@@ -136,29 +136,29 @@ pub struct ClearVmBits {
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum NeonrmgrRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum SmgrRecord {
|
||||
Create(SmgrCreate),
|
||||
Truncate(XlSmgrTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct SmgrCreate {
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum DbaseRecord {
|
||||
Create(DbaseCreate),
|
||||
Drop(DbaseDrop),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct DbaseCreate {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_id: Oid,
|
||||
@@ -166,32 +166,32 @@ pub struct DbaseCreate {
|
||||
pub src_tablespace_id: Oid,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct DbaseDrop {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_ids: Vec<Oid>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum ClogRecord {
|
||||
ZeroPage(ClogZeroPage),
|
||||
Truncate(ClogTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct ClogZeroPage {
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct ClogTruncate {
|
||||
pub pageno: u32,
|
||||
pub oldest_xid: TransactionId,
|
||||
pub oldest_xid_db: Oid,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum XactRecord {
|
||||
Commit(XactCommon),
|
||||
Abort(XactCommon),
|
||||
@@ -200,7 +200,7 @@ pub enum XactRecord {
|
||||
Prepare(XactPrepare),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct XactCommon {
|
||||
pub parsed: XlXactParsedRecord,
|
||||
pub origin_id: u16,
|
||||
@@ -209,73 +209,73 @@ pub struct XactCommon {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct XactPrepare {
|
||||
pub xl_xid: TransactionId,
|
||||
pub data: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum MultiXactRecord {
|
||||
ZeroPage(MultiXactZeroPage),
|
||||
Create(XlMultiXactCreate),
|
||||
Truncate(XlMultiXactTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct MultiXactZeroPage {
|
||||
pub slru_kind: SlruKind,
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum RelmapRecord {
|
||||
Update(RelmapUpdate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct RelmapUpdate {
|
||||
pub update: XlRelmapUpdate,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum XlogRecord {
|
||||
Raw(RawXlogRecord),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct RawXlogRecord {
|
||||
pub info: u8,
|
||||
pub lsn: Lsn,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum LogicalMessageRecord {
|
||||
Put(PutLogicalMessage),
|
||||
#[cfg(feature = "testing")]
|
||||
Failpoint,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct PutLogicalMessage {
|
||||
pub path: String,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum StandbyRecord {
|
||||
RunningXacts(StandbyRunningXacts),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct StandbyRunningXacts {
|
||||
pub oldest_running_xid: TransactionId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum ReploriginRecord {
|
||||
Set(XlReploriginSet),
|
||||
Drop(XlReploriginDrop),
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! Such batches are created from decoded PG wal records and ingested
|
||||
//! by the pageserver by writing directly to the ephemeral file.
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
@@ -22,6 +22,8 @@ use utils::lsn::Lsn;
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
|
||||
use crate::models::InterpretedWalRecord;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
|
||||
/// Accompanying metadata for the batch
|
||||
@@ -30,7 +32,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
/// relation sizes. In the case of "observed" values, we only need to know
|
||||
/// the key and LSN, so two types of metadata are supported to save on network
|
||||
/// bandwidth.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum ValueMeta {
|
||||
Serialized(SerializedValueMeta),
|
||||
Observed(ObservedValueMeta),
|
||||
@@ -77,7 +79,7 @@ impl PartialEq for OrderedValueMeta {
|
||||
impl Eq for OrderedValueMeta {}
|
||||
|
||||
/// Metadata for a [`Value`] serialized into the batch.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SerializedValueMeta {
|
||||
pub key: CompactKey,
|
||||
pub lsn: Lsn,
|
||||
@@ -89,14 +91,14 @@ pub struct SerializedValueMeta {
|
||||
}
|
||||
|
||||
/// Metadata for a [`Value`] observed by the batch
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct ObservedValueMeta {
|
||||
pub key: CompactKey,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Batch of serialized [`Value`]s.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct SerializedValueBatch {
|
||||
/// [`Value`]s serialized in EphemeralFile's native format,
|
||||
/// ready for disk write by the pageserver
|
||||
@@ -128,7 +130,8 @@ impl Default for SerializedValueBatch {
|
||||
}
|
||||
|
||||
impl SerializedValueBatch {
|
||||
/// Build a batch of serialized values from a decoded PG WAL record
|
||||
/// Populates the given `shard_records` with value batches from this WAL record, if any,
|
||||
/// discarding those belonging to other shards.
|
||||
///
|
||||
/// The batch will only contain values for keys targeting the specifiec
|
||||
/// shard. Shard 0 is a special case, where any keys that don't belong to
|
||||
@@ -136,21 +139,20 @@ impl SerializedValueBatch {
|
||||
/// but absent from the raw buffer [`SerializedValueBatch::raw`]).
|
||||
pub(crate) fn from_decoded_filtered(
|
||||
decoded: DecodedWALRecord,
|
||||
shard: &ShardIdentity,
|
||||
shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<SerializedValueBatch> {
|
||||
// First determine how big the buffer needs to be and allocate it up-front.
|
||||
) -> anyhow::Result<()> {
|
||||
// First determine how big the buffers need to be and allocate it up-front.
|
||||
// This duplicates some of the work below, but it's empirically much faster.
|
||||
let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
|
||||
let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
assert!(record.batch.is_empty());
|
||||
|
||||
let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version);
|
||||
record.batch.raw = Vec::with_capacity(estimate);
|
||||
}
|
||||
|
||||
let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
|
||||
let mut max_lsn: Lsn = Lsn(0);
|
||||
let mut len: usize = 0;
|
||||
for blk in decoded.blocks.iter() {
|
||||
let relative_off = buf.len() as u64;
|
||||
|
||||
let rel = RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
@@ -168,99 +170,98 @@ impl SerializedValueBatch {
|
||||
);
|
||||
}
|
||||
|
||||
let key_is_local = shard.is_key_local(&key);
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
let key_is_local = shard.is_key_local(&key);
|
||||
|
||||
tracing::debug!(
|
||||
lsn=%next_record_lsn,
|
||||
key=%key,
|
||||
"ingest: shard decision {}",
|
||||
if !key_is_local { "drop" } else { "keep" },
|
||||
);
|
||||
tracing::debug!(
|
||||
lsn=%next_record_lsn,
|
||||
key=%key,
|
||||
"ingest: shard decision {}",
|
||||
if !key_is_local { "drop" } else { "keep" },
|
||||
);
|
||||
|
||||
if !key_is_local {
|
||||
if shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
metadata.push(ValueMeta::Observed(ObservedValueMeta {
|
||||
if !key_is_local {
|
||||
if shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
record
|
||||
.batch
|
||||
.metadata
|
||||
.push(ValueMeta::Observed(ObservedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so them have to be copied multiple times.
|
||||
//
|
||||
let val = if Self::block_is_image(&decoded, blk, pg_version) {
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
|
||||
// TODO(vlad): skip the copy
|
||||
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
//
|
||||
// Match the logic of XLogReadBufferForRedoExtended:
|
||||
// The page may be uninitialized. If so, we can't set the LSN because
|
||||
// that would corrupt the page.
|
||||
//
|
||||
if !page_is_new(&image) {
|
||||
page_set_lsn(&mut image, next_record_lsn)
|
||||
}
|
||||
assert_eq!(image.len(), BLCKSZ as usize);
|
||||
|
||||
Value::Image(image.freeze())
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: decoded.record.clone(),
|
||||
})
|
||||
};
|
||||
|
||||
let relative_off = record.batch.raw.len() as u64;
|
||||
|
||||
val.ser_into(&mut record.batch.raw)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
let val_ser_size = record.batch.raw.len() - relative_off as usize;
|
||||
|
||||
record
|
||||
.batch
|
||||
.metadata
|
||||
.push(ValueMeta::Serialized(SerializedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
continue;
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
}));
|
||||
record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn);
|
||||
record.batch.len += 1;
|
||||
}
|
||||
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so them have to be copied multiple times.
|
||||
//
|
||||
let val = if Self::block_is_image(&decoded, blk, pg_version) {
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
|
||||
// TODO(vlad): skip the copy
|
||||
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
//
|
||||
// Match the logic of XLogReadBufferForRedoExtended:
|
||||
// The page may be uninitialized. If so, we can't set the LSN because
|
||||
// that would corrupt the page.
|
||||
//
|
||||
if !page_is_new(&image) {
|
||||
page_set_lsn(&mut image, next_record_lsn)
|
||||
}
|
||||
assert_eq!(image.len(), BLCKSZ as usize);
|
||||
|
||||
Value::Image(image.freeze())
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: decoded.record.clone(),
|
||||
})
|
||||
};
|
||||
|
||||
val.ser_into(&mut buf)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
let val_ser_size = buf.len() - relative_off as usize;
|
||||
|
||||
metadata.push(ValueMeta::Serialized(SerializedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
}));
|
||||
max_lsn = std::cmp::max(max_lsn, next_record_lsn);
|
||||
len += 1;
|
||||
}
|
||||
|
||||
if cfg!(any(debug_assertions, test)) {
|
||||
let batch = Self {
|
||||
raw: buf,
|
||||
metadata,
|
||||
max_lsn,
|
||||
len,
|
||||
};
|
||||
|
||||
batch.validate_lsn_order();
|
||||
|
||||
return Ok(batch);
|
||||
// Validate that the batches are correct
|
||||
for record in shard_records.values() {
|
||||
record.batch.validate_lsn_order();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
raw: buf,
|
||||
metadata,
|
||||
max_lsn,
|
||||
len,
|
||||
})
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look into the decoded PG WAL record and determine
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
@@ -16,6 +16,7 @@ arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
bit_field.workspace = true
|
||||
bincode.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
@@ -113,7 +114,3 @@ harness = false
|
||||
[[bench]]
|
||||
name = "upload_queue"
|
||||
harness = false
|
||||
|
||||
[[bin]]
|
||||
name = "test_helper_slow_client_reads"
|
||||
required-features = [ "testing" ]
|
||||
|
||||
@@ -4,9 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
testing = [ "pageserver_api/testing" ]
|
||||
|
||||
[dependencies]
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::pin::Pin;
|
||||
|
||||
use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
@@ -13,6 +10,7 @@ use pageserver_api::{
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -64,28 +62,15 @@ impl Client {
|
||||
.client
|
||||
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
|
||||
.await?;
|
||||
let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away.
|
||||
let Client {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
client: _,
|
||||
} = self;
|
||||
let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning(
|
||||
ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
},
|
||||
)));
|
||||
Ok(PagestreamClient {
|
||||
sink: PagestreamSender {
|
||||
shared: shared.clone(),
|
||||
sink,
|
||||
},
|
||||
stream: PagestreamReceiver {
|
||||
shared: shared.clone(),
|
||||
stream,
|
||||
},
|
||||
shared,
|
||||
copy_both: Box::pin(copy_both),
|
||||
conn_task,
|
||||
cancel_on_client_drop,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -112,28 +97,7 @@ impl Client {
|
||||
|
||||
/// Create using [`Client::pagestream`].
|
||||
pub struct PagestreamClient {
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: PagestreamSender,
|
||||
stream: PagestreamReceiver,
|
||||
}
|
||||
|
||||
pub struct PagestreamSender {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: SplitSink<tokio_postgres::CopyBothDuplex<bytes::Bytes>, bytes::Bytes>,
|
||||
}
|
||||
|
||||
pub struct PagestreamReceiver {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
stream: SplitStream<tokio_postgres::CopyBothDuplex<bytes::Bytes>>,
|
||||
}
|
||||
|
||||
enum PagestreamShared {
|
||||
ConnTaskRunning(ConnTaskRunning),
|
||||
ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
}
|
||||
struct ConnTaskRunning {
|
||||
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
@@ -146,11 +110,11 @@ pub struct RelTagBlockNo {
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(self) {
|
||||
let Self {
|
||||
shared,
|
||||
sink,
|
||||
stream,
|
||||
} = { self };
|
||||
// The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
copy_both,
|
||||
cancel_on_client_drop: cancel_conn_task,
|
||||
conn_task,
|
||||
} = self;
|
||||
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
|
||||
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
|
||||
//
|
||||
@@ -167,77 +131,27 @@ impl PagestreamClient {
|
||||
//
|
||||
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
|
||||
// => https://github.com/neondatabase/neon/issues/6390
|
||||
let ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
} = {
|
||||
let mut guard = shared.lock().unwrap();
|
||||
match std::mem::replace(
|
||||
&mut *guard,
|
||||
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
) {
|
||||
PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running,
|
||||
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(),
|
||||
}
|
||||
};
|
||||
let _ = cancel_on_client_drop.unwrap();
|
||||
let _ = cancel_conn_task.unwrap();
|
||||
conn_task.await.unwrap();
|
||||
|
||||
// Now drop the split copy_both.
|
||||
drop(sink);
|
||||
drop(stream);
|
||||
}
|
||||
|
||||
pub fn split(self) -> (PagestreamSender, PagestreamReceiver) {
|
||||
let Self {
|
||||
shared: _,
|
||||
sink,
|
||||
stream,
|
||||
} = self;
|
||||
(sink, stream)
|
||||
drop(copy_both);
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
req: PagestreamGetPageRequest,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
self.getpage_send(req).await?;
|
||||
self.getpage_recv().await
|
||||
}
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
|
||||
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.sink.getpage_send(req).await
|
||||
}
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
|
||||
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
self.stream.getpage_recv().await
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamSender {
|
||||
// TODO: maybe make this impl Sink instead for better composability?
|
||||
pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> {
|
||||
let msg = msg.serialize();
|
||||
self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.send(PagestreamFeMessage::GetPage(req)).await
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamReceiver {
|
||||
// TODO: maybe make this impl Stream instead for better composability?
|
||||
pub async fn recv(&mut self) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
let next: bytes::Bytes = next.unwrap()?;
|
||||
PagestreamBeMessage::deserialize(next)
|
||||
}
|
||||
|
||||
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let next: PagestreamBeMessage = self.recv().await?;
|
||||
match next {
|
||||
let msg = PagestreamBeMessage::deserialize(next)?;
|
||||
match msg {
|
||||
PagestreamBeMessage::GetPage(p) => Ok(p),
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::Exists(_)
|
||||
@@ -246,14 +160,7 @@ impl PagestreamReceiver {
|
||||
| PagestreamBeMessage::GetSlruSegment(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
next.kind()
|
||||
)
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamBeMessage::Test(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
next.kind()
|
||||
msg.kind()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,9 +8,11 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
camino.workspace = true
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
humantime.workspace = true
|
||||
itertools.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
remote_storage = { path = "../../libs/remote_storage" }
|
||||
|
||||
@@ -9,7 +9,9 @@ mod index_part;
|
||||
mod key;
|
||||
mod layer_map_analyzer;
|
||||
mod layers;
|
||||
mod page_trace;
|
||||
|
||||
use page_trace::PageTraceCmd;
|
||||
use std::{
|
||||
str::FromStr,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -64,6 +66,7 @@ enum Commands {
|
||||
Layer(LayerCmd),
|
||||
/// Debug print a hex key found from logs
|
||||
Key(key::DescribeKeyCommand),
|
||||
PageTrace(PageTraceCmd),
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
@@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
73
pageserver/ctl/src/page_trace.rs
Normal file
73
pageserver/ctl/src/page_trace.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::BufReader;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use itertools::Itertools as _;
|
||||
use pageserver_api::key::{CompactKey, Key};
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::reltag::RelTag;
|
||||
|
||||
/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats.
|
||||
#[derive(Parser)]
|
||||
pub(crate) struct PageTraceCmd {
|
||||
/// Trace input file.
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> {
|
||||
let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?);
|
||||
let mut events: Vec<PageTraceEvent> = Vec::new();
|
||||
loop {
|
||||
match bincode::deserialize_from(&mut file) {
|
||||
Ok(event) => events.push(event),
|
||||
Err(err) => {
|
||||
if let bincode::ErrorKind::Io(ref err) = *err {
|
||||
if err.kind() == std::io::ErrorKind::UnexpectedEof {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return Err(err.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut reads_by_relation: HashMap<RelTag, i64> = HashMap::new();
|
||||
let mut reads_by_key: HashMap<CompactKey, i64> = HashMap::new();
|
||||
|
||||
for event in events {
|
||||
let key = Key::from_compact(event.key);
|
||||
let reltag = RelTag {
|
||||
spcnode: key.field2,
|
||||
dbnode: key.field3,
|
||||
relnode: key.field4,
|
||||
forknum: key.field5,
|
||||
};
|
||||
|
||||
*reads_by_relation.entry(reltag).or_default() += 1;
|
||||
*reads_by_key.entry(event.key).or_default() += 1;
|
||||
}
|
||||
|
||||
let multi_read_keys = reads_by_key
|
||||
.into_iter()
|
||||
.filter(|(_, count)| *count > 1)
|
||||
.sorted_by_key(|(key, count)| (-*count, *key))
|
||||
.collect_vec();
|
||||
|
||||
println!("Multi-read keys: {}", multi_read_keys.len());
|
||||
for (key, count) in multi_read_keys {
|
||||
println!(" {key}: {count}");
|
||||
}
|
||||
|
||||
let reads_by_relation = reads_by_relation
|
||||
.into_iter()
|
||||
.sorted_by_key(|(rel, count)| (-*count, *rel))
|
||||
.collect_vec();
|
||||
|
||||
println!("Reads by relation:");
|
||||
for (reltag, count) in reads_by_relation {
|
||||
println!(" {reltag}: {count}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,65 +0,0 @@
|
||||
use std::{
|
||||
io::{stdin, stdout, Read, Write},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use clap::Parser;
|
||||
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
struct Args {
|
||||
connstr: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let Args {
|
||||
connstr,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = Args::parse();
|
||||
let client = pageserver_client::page_service::Client::new(connstr).await?;
|
||||
let client = client.pagestream(tenant_id, timeline_id).await?;
|
||||
let (mut sender, _receiver) = client.split();
|
||||
|
||||
eprintln!("filling the pipe");
|
||||
let mut msg = 0;
|
||||
loop {
|
||||
msg += 1;
|
||||
let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
|
||||
PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(23),
|
||||
not_modified_since: Lsn(23),
|
||||
},
|
||||
batch_key: 42,
|
||||
message: format!("message {}", msg),
|
||||
},
|
||||
));
|
||||
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
|
||||
eprintln!("pipe seems full");
|
||||
break;
|
||||
};
|
||||
let _: () = res?;
|
||||
}
|
||||
|
||||
let n = stdout().write(b"R")?;
|
||||
assert_eq!(n, 1);
|
||||
stdout().flush()?;
|
||||
|
||||
eprintln!("waiting for signal to tell us to exit");
|
||||
|
||||
let mut buf = [0u8; 1];
|
||||
stdin().read_exact(&mut buf)?;
|
||||
|
||||
eprintln!("termination signal received, exiting");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
@@ -27,6 +27,7 @@ use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::LsnLeaseRequest;
|
||||
use pageserver_api::models::OffloadedTimelineInfo;
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantConfigPatchRequest;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
@@ -51,7 +52,9 @@ use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeTravelError;
|
||||
use scopeguard::defer;
|
||||
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -1521,6 +1524,71 @@ async fn timeline_gc_unblocking_handler(
|
||||
block_or_unblock_gc(request, false).await
|
||||
}
|
||||
|
||||
/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding.
|
||||
/// Use the `pagectl page-trace` command to decode and analyze the output.
|
||||
async fn timeline_page_trace_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let state = get_state(&request);
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024);
|
||||
let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5);
|
||||
|
||||
// Convert size limit to event limit based on the serialized size of an event. The event size is
|
||||
// fixed, as the default bincode serializer uses fixed-width integer encoding.
|
||||
let event_size = bincode::serialize(&PageTraceEvent::default())
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?
|
||||
.len();
|
||||
let event_limit = size_limit / event_size;
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
// Install a page trace, unless one is already in progress. We just use a buffered channel,
|
||||
// which may 2x the memory usage in the worst case, but it's still bounded.
|
||||
let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit);
|
||||
let cur = timeline.page_trace.load();
|
||||
let installed = cur.is_none()
|
||||
&& timeline
|
||||
.page_trace
|
||||
.compare_and_swap(cur, Some(Arc::new(trace_tx)))
|
||||
.is_none();
|
||||
if !installed {
|
||||
return Err(ApiError::Conflict("page trace already active".to_string()));
|
||||
}
|
||||
defer!(timeline.page_trace.store(None)); // uninstall on return
|
||||
|
||||
// Collect the trace and return it to the client. We could stream the response, but this is
|
||||
// simple and fine.
|
||||
let mut body = Vec::with_capacity(size_limit);
|
||||
let deadline = Instant::now() + Duration::from_secs(time_limit_secs);
|
||||
|
||||
while body.len() < size_limit {
|
||||
tokio::select! {
|
||||
event = trace_rx.recv() => {
|
||||
let Some(event) = event else {
|
||||
break; // shouldn't happen (sender doesn't close, unless timeline dropped)
|
||||
};
|
||||
bincode::serialize_into(&mut body, &event)
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?;
|
||||
}
|
||||
_ = tokio::time::sleep_until(deadline) => break, // time limit reached
|
||||
_ = cancel.cancelled() => return Err(ApiError::Cancelled),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "application/octet-stream")
|
||||
.body(hyper::Body::from(body))
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
|
||||
///
|
||||
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
|
||||
@@ -3479,6 +3547,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|
||||
|r| api_handler(r, timeline_gc_unblocking_handler),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace",
|
||||
|r| api_handler(r, timeline_page_trace_handler),
|
||||
)
|
||||
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
|
||||
api_handler(r, secondary_upload_handler)
|
||||
})
|
||||
|
||||
@@ -278,6 +278,8 @@ async fn import_wal(
|
||||
|
||||
let mut walingest = WalIngest::new(tline, startpoint, ctx).await?;
|
||||
|
||||
let shard = vec![*tline.get_shard_identity()];
|
||||
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
|
||||
@@ -314,10 +316,12 @@ async fn import_wal(
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
tline.get_shard_identity(),
|
||||
&shard,
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?;
|
||||
)?
|
||||
.remove(tline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
@@ -411,6 +415,7 @@ pub async fn import_wal_from_tar(
|
||||
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = start_lsn;
|
||||
let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?;
|
||||
let shard = vec![*tline.get_shard_identity()];
|
||||
|
||||
// Ingest wal until end_lsn
|
||||
info!("importing wal until {}", end_lsn);
|
||||
@@ -459,10 +464,12 @@ pub async fn import_wal_from_tar(
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
tline.get_shard_identity(),
|
||||
&shard,
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?;
|
||||
)?
|
||||
.remove(tline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
|
||||
@@ -1463,8 +1463,6 @@ pub enum SmgrQueryType {
|
||||
GetPageAtLsn,
|
||||
GetDbSize,
|
||||
GetSlruSegment,
|
||||
#[cfg(feature = "testing")]
|
||||
Test,
|
||||
}
|
||||
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
|
||||
@@ -67,6 +67,7 @@ use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::{basebackup, timed_after_cancellation};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -554,12 +555,6 @@ struct BatchedGetPageRequest {
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
struct BatchedTestRequest {
|
||||
req: models::PagestreamTestRequest,
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
enum BatchedFeMessage {
|
||||
Exists {
|
||||
span: Span,
|
||||
@@ -591,12 +586,6 @@ enum BatchedFeMessage {
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamGetSlruSegmentRequest,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
Test {
|
||||
span: Span,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
requests: Vec<BatchedTestRequest>,
|
||||
},
|
||||
RespondError {
|
||||
span: Span,
|
||||
error: BatchedPageStreamError,
|
||||
@@ -617,12 +606,6 @@ impl BatchedFeMessage {
|
||||
page.timer.observe_execution_start(at);
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
BatchedFeMessage::Test { requests, .. } => {
|
||||
for req in requests {
|
||||
req.timer.observe_execution_start(at);
|
||||
}
|
||||
}
|
||||
BatchedFeMessage::RespondError { .. } => {}
|
||||
}
|
||||
}
|
||||
@@ -883,22 +866,6 @@ impl PageServerHandler {
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessage::Test(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer =
|
||||
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
|
||||
.await?;
|
||||
BatchedFeMessage::Test {
|
||||
span,
|
||||
shard,
|
||||
requests: vec![BatchedTestRequest { req, timer }],
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Some(batched_msg))
|
||||
}
|
||||
@@ -961,46 +928,6 @@ impl PageServerHandler {
|
||||
accum_pages.extend(this_pages);
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
Ok(BatchedFeMessage::Test {
|
||||
shard: accum_shard,
|
||||
requests: accum_requests,
|
||||
..
|
||||
}),
|
||||
BatchedFeMessage::Test {
|
||||
shard: this_shard,
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) if (|| {
|
||||
assert!(this_requests.len() == 1);
|
||||
if accum_requests.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_requests.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
|
||||
!= (this_shard.tenant_shard_id, this_shard.timeline_id)
|
||||
{
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
let this_batch_key = this_requests[0].req.batch_key;
|
||||
let accum_batch_key = accum_requests[0].req.batch_key;
|
||||
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
|
||||
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})() =>
|
||||
{
|
||||
// ok to batch
|
||||
accum_requests.extend(this_requests);
|
||||
Ok(())
|
||||
}
|
||||
// something batched already but this message is unbatchable
|
||||
(_, this_msg) => {
|
||||
// by default, don't continue batching
|
||||
@@ -1127,27 +1054,6 @@ impl PageServerHandler {
|
||||
span,
|
||||
)
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
BatchedFeMessage::Test {
|
||||
span,
|
||||
shard,
|
||||
requests,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::test");
|
||||
(
|
||||
{
|
||||
let npages = requests.len();
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_test_request_batch(&shard, requests, ctx)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
res
|
||||
},
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::RespondError { span, error } => {
|
||||
// We've already decided to respond with an error, so we don't need to
|
||||
// call the handler.
|
||||
@@ -1813,6 +1719,20 @@ impl PageServerHandler {
|
||||
.query_metrics
|
||||
.observe_getpage_batch_start(requests.len());
|
||||
|
||||
// If a page trace is running, submit an event for this request.
|
||||
if let Some(page_trace) = timeline.page_trace.load().as_ref() {
|
||||
let time = SystemTime::now();
|
||||
for batch in &requests {
|
||||
let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact();
|
||||
// Ignore error (trace buffer may be full or tracer may have disconnected).
|
||||
_ = page_trace.try_send(PageTraceEvent {
|
||||
key,
|
||||
effective_lsn,
|
||||
time,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let results = timeline
|
||||
.get_rel_page_at_lsn_batched(
|
||||
requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
|
||||
@@ -1871,51 +1791,6 @@ impl PageServerHandler {
|
||||
))
|
||||
}
|
||||
|
||||
// NB: this impl mimics what we do for batched getpage requests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_test_request_batch(
|
||||
&mut self,
|
||||
timeline: &Timeline,
|
||||
requests: Vec<BatchedTestRequest>,
|
||||
_ctx: &RequestContext,
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
|
||||
// real requests would do something with the timeline
|
||||
let mut results = Vec::with_capacity(requests.len());
|
||||
for _req in requests.iter() {
|
||||
tokio::task::yield_now().await;
|
||||
|
||||
results.push({
|
||||
if timeline.cancel.is_cancelled() {
|
||||
Err(PageReconstructError::Cancelled)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: avoid creating the new Vec here
|
||||
Vec::from_iter(
|
||||
requests
|
||||
.into_iter()
|
||||
.zip(results.into_iter())
|
||||
.map(|(req, res)| {
|
||||
res.map(|()| {
|
||||
(
|
||||
PagestreamBeMessage::Test(models::PagestreamTestResponse {
|
||||
req: req.req.clone(),
|
||||
}),
|
||||
req.timer,
|
||||
)
|
||||
})
|
||||
.map_err(|e| BatchedPageStreamError {
|
||||
err: PageStreamError::from(e),
|
||||
req: req.req.hdr,
|
||||
})
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
/// Note on "fullbackup":
|
||||
/// Full basebackups should only be used for debugging purposes.
|
||||
/// Originally, it was introduced to enable breaking storage format changes,
|
||||
|
||||
@@ -14,7 +14,7 @@ pub mod uninit;
|
||||
mod walreceiver;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use arc_swap::ArcSwap;
|
||||
use arc_swap::{ArcSwap, ArcSwapOption};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use chrono::{DateTime, Utc};
|
||||
@@ -23,6 +23,7 @@ use fail::fail_point;
|
||||
use handle::ShardTimelineId;
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::PageTraceEvent;
|
||||
use pageserver_api::{
|
||||
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
key::{
|
||||
@@ -42,6 +43,7 @@ use rand::Rng;
|
||||
use remote_storage::DownloadError;
|
||||
use serde_with::serde_as;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::{
|
||||
runtime::Handle,
|
||||
sync::{oneshot, watch},
|
||||
@@ -433,6 +435,9 @@ pub struct Timeline {
|
||||
|
||||
/// Cf. [`crate::tenant::CreateTimelineIdempotency`].
|
||||
pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
||||
|
||||
/// If Some, collects GetPage metadata for an ongoing PageTrace.
|
||||
pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
|
||||
}
|
||||
|
||||
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
|
||||
@@ -2380,6 +2385,8 @@ impl Timeline {
|
||||
attach_wal_lag_cooldown,
|
||||
|
||||
create_idempotency,
|
||||
|
||||
page_trace: Default::default(),
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
|
||||
@@ -1776,7 +1776,10 @@ impl Timeline {
|
||||
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
|
||||
) -> anyhow::Result<KeyHistoryRetention> {
|
||||
// Pre-checks for the invariants
|
||||
if cfg!(debug_assertions) {
|
||||
|
||||
let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
|
||||
|
||||
if debug_mode {
|
||||
for (log_key, _, _) in full_history {
|
||||
assert_eq!(log_key, &key, "mismatched key");
|
||||
}
|
||||
@@ -1922,15 +1925,19 @@ impl Timeline {
|
||||
output
|
||||
}
|
||||
|
||||
let mut key_exists = false;
|
||||
for (i, split_for_lsn) in split_history.into_iter().enumerate() {
|
||||
// TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
|
||||
records_since_last_image += split_for_lsn.len();
|
||||
let generate_image = if i == 0 && !has_ancestor {
|
||||
// Whether to produce an image into the final layer files
|
||||
let produce_image = if i == 0 && !has_ancestor {
|
||||
// We always generate images for the first batch (below horizon / lowest retain_lsn)
|
||||
true
|
||||
} else if i == batch_cnt - 1 {
|
||||
// Do not generate images for the last batch (above horizon)
|
||||
false
|
||||
} else if records_since_last_image == 0 {
|
||||
false
|
||||
} else if records_since_last_image >= delta_threshold_cnt {
|
||||
// Generate images when there are too many records
|
||||
true
|
||||
@@ -1945,29 +1952,45 @@ impl Timeline {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if let Some((_, _, val)) = replay_history.first() {
|
||||
if !val.will_init() {
|
||||
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
|
||||
|| {
|
||||
generate_debug_trace(
|
||||
Some(&replay_history),
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
)
|
||||
},
|
||||
);
|
||||
}
|
||||
if replay_history.is_empty() && !key_exists {
|
||||
// The key does not exist at earlier LSN, we can skip this iteration.
|
||||
retention.push(Vec::new());
|
||||
continue;
|
||||
} else {
|
||||
key_exists = true;
|
||||
}
|
||||
if generate_image && records_since_last_image > 0 {
|
||||
let Some((_, _, val)) = replay_history.first() else {
|
||||
unreachable!("replay history should not be empty once it exists")
|
||||
};
|
||||
if !val.will_init() {
|
||||
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| {
|
||||
generate_debug_trace(
|
||||
Some(&replay_history),
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
)
|
||||
});
|
||||
}
|
||||
// Whether to reconstruct the image. In debug mode, we will generate an image
|
||||
// at every retain_lsn to ensure data is not corrupted, but we won't put the
|
||||
// image into the final layer.
|
||||
let generate_image = produce_image || debug_mode;
|
||||
if produce_image {
|
||||
records_since_last_image = 0;
|
||||
let replay_history_for_debug = if cfg!(debug_assertions) {
|
||||
}
|
||||
let img_and_lsn = if generate_image {
|
||||
let replay_history_for_debug = if debug_mode {
|
||||
Some(replay_history.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
|
||||
let history = std::mem::take(&mut replay_history);
|
||||
let history = if produce_image {
|
||||
std::mem::take(&mut replay_history)
|
||||
} else {
|
||||
replay_history.clone()
|
||||
};
|
||||
let mut img = None;
|
||||
let mut records = Vec::with_capacity(history.len());
|
||||
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
|
||||
@@ -2004,8 +2027,20 @@ impl Timeline {
|
||||
}
|
||||
records.reverse();
|
||||
let state = ValueReconstructState { img, records };
|
||||
let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
|
||||
// last batch does not generate image so i is always in range, unless we force generate
|
||||
// an image during testing
|
||||
let request_lsn = if i >= lsn_split_points.len() {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
lsn_split_points[i]
|
||||
};
|
||||
let img = self.reconstruct_value(key, request_lsn, state).await?;
|
||||
Some((request_lsn, img))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
if produce_image {
|
||||
let (request_lsn, img) = img_and_lsn.unwrap();
|
||||
replay_history.push((key, request_lsn, Value::Image(img.clone())));
|
||||
retention.push(vec![(request_lsn, Value::Image(img))]);
|
||||
} else {
|
||||
@@ -2273,6 +2308,8 @@ impl Timeline {
|
||||
let compact_key_range = job.compact_key_range;
|
||||
let compact_lsn_range = job.compact_lsn_range;
|
||||
|
||||
let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
|
||||
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);
|
||||
|
||||
scopeguard::defer! {
|
||||
@@ -2398,7 +2435,7 @@ impl Timeline {
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(job_desc.gc_cutoff);
|
||||
if cfg!(debug_assertions) {
|
||||
if debug_mode {
|
||||
assert_eq!(
|
||||
res,
|
||||
job_desc
|
||||
|
||||
@@ -264,6 +264,8 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
|
||||
|
||||
let shard = vec![*timeline.get_shard_identity()];
|
||||
|
||||
let interpreted_proto_config = match protocol {
|
||||
PostgresClientProtocol::Vanilla => None,
|
||||
PostgresClientProtocol::Interpreted {
|
||||
@@ -476,10 +478,12 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
modification.tline.get_shard_identity(),
|
||||
&shard,
|
||||
next_record_lsn,
|
||||
modification.tline.pg_version,
|
||||
)?;
|
||||
)?
|
||||
.remove(timeline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
|
||||
@@ -2163,10 +2163,12 @@ mod tests {
|
||||
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
modification.tline.get_shard_identity(),
|
||||
&[*modification.tline.get_shard_identity()],
|
||||
lsn,
|
||||
modification.tline.pg_version,
|
||||
)
|
||||
.unwrap()
|
||||
.remove(modification.tline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
walingest
|
||||
|
||||
@@ -911,7 +911,74 @@ pageserver_receive(shardno_t shard_no)
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
else if (rc == -1)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect(shard_no);
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
|
||||
shard->nresponses_received++;
|
||||
return (NeonResponse *) resp;
|
||||
}
|
||||
|
||||
static NeonResponse *
|
||||
pageserver_try_receive(shardno_t shard_no)
|
||||
{
|
||||
StringInfoData resp_buff;
|
||||
NeonResponse *resp;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn = shard->conn;
|
||||
/* read response */
|
||||
int rc;
|
||||
|
||||
if (shard->state != PS_Connected)
|
||||
return NULL;
|
||||
|
||||
Assert(pageserver_conn);
|
||||
|
||||
rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */);
|
||||
|
||||
if (rc == 0)
|
||||
return NULL;
|
||||
else if (rc > 0)
|
||||
{
|
||||
PG_TRY();
|
||||
{
|
||||
resp_buff.len = rc;
|
||||
resp_buff.cursor = 0;
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response");
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
@@ -980,6 +1047,7 @@ page_server_api api =
|
||||
.send = pageserver_send,
|
||||
.flush = pageserver_flush,
|
||||
.receive = pageserver_receive,
|
||||
.try_receive = pageserver_try_receive,
|
||||
.disconnect = pageserver_disconnect_shard
|
||||
};
|
||||
|
||||
|
||||
@@ -192,9 +192,29 @@ typedef uint16 shardno_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/*
|
||||
* Send this request to the PageServer associated with this shard.
|
||||
*/
|
||||
bool (*send) (shardno_t shard_no, NeonRequest * request);
|
||||
/*
|
||||
* Blocking read for the next response of this shard.
|
||||
*
|
||||
* When a CANCEL signal is handled, the connection state will be
|
||||
* unmodified.
|
||||
*/
|
||||
NeonResponse *(*receive) (shardno_t shard_no);
|
||||
/*
|
||||
* Try get the next response from the TCP buffers, if any.
|
||||
* Returns NULL when the data is not yet available.
|
||||
*/
|
||||
NeonResponse *(*try_receive) (shardno_t shard_no);
|
||||
/*
|
||||
* Make sure all requests are sent to PageServer.
|
||||
*/
|
||||
bool (*flush) (shardno_t shard_no);
|
||||
/*
|
||||
* Disconnect from this pageserver shard.
|
||||
*/
|
||||
void (*disconnect) (shardno_t shard_no);
|
||||
} page_server_api;
|
||||
|
||||
|
||||
@@ -405,6 +405,56 @@ compact_prefetch_buffers(void)
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there might be responses still in the TCP buffer, then
|
||||
* we should try to use those, so as to reduce any TCP backpressure
|
||||
* on the OS/PS side.
|
||||
*
|
||||
* This procedure handles that.
|
||||
*
|
||||
* Note that this is only valid as long as the only pipelined
|
||||
* operations in the TCP buffer are getPage@Lsn requests.
|
||||
*/
|
||||
static void
|
||||
prefetch_pump_state(void)
|
||||
{
|
||||
while (MyPState->ring_receive != MyPState->ring_flush)
|
||||
{
|
||||
NeonResponse *response;
|
||||
PrefetchRequest *slot;
|
||||
MemoryContext old;
|
||||
|
||||
slot = GetPrfSlot(MyPState->ring_receive);
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = page_server->try_receive(slot->shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
|
||||
if (response == NULL)
|
||||
break;
|
||||
|
||||
/* The slot should still be valid */
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(slot->shard_no, ERROR,
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
|
||||
slot->status, slot->response,
|
||||
(long) slot->my_ring_index, (long) MyPState->ring_receive);
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_responses_buffered += 1;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
MyPState->ring_receive += 1;
|
||||
MyNeonCounters->getpage_prefetches_buffered =
|
||||
MyPState->n_responses_buffered;
|
||||
|
||||
/* update slot state */
|
||||
slot->status = PRFS_RECEIVED;
|
||||
slot->response = response;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
readahead_buffer_resize(int newsize, void *extra)
|
||||
{
|
||||
@@ -2808,6 +2858,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
MyPState->ring_last <= ring_index);
|
||||
}
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2849,6 +2901,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
Assert(ring_index < MyPState->ring_unused &&
|
||||
MyPState->ring_last <= ring_index);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
return false;
|
||||
}
|
||||
#endif /* PG_MAJORVERSION_NUM < 17 */
|
||||
@@ -2891,6 +2945,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
*/
|
||||
neon_log(SmgrTrace, "writeback noop");
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdwriteback(reln, forknum, blocknum, nblocks);
|
||||
@@ -3145,6 +3201,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL);
|
||||
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
{
|
||||
@@ -3282,6 +3340,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
|
||||
buffers, nblocks, read);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
{
|
||||
@@ -3450,6 +3510,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
|
||||
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
@@ -3503,6 +3565,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
|
||||
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
|
||||
@@ -3792,6 +3856,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
|
||||
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
|
||||
prefetch_pump_state();
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdimmedsync(reln, forknum);
|
||||
|
||||
@@ -26,6 +26,7 @@ hex.workspace = true
|
||||
humantime.workspace = true
|
||||
http.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
futures.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
@@ -39,6 +40,7 @@ scopeguard.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
smallvec.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
thiserror.workspace = true
|
||||
@@ -63,6 +65,7 @@ storage_broker.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
utils.workspace = true
|
||||
wal_decoder.workspace = true
|
||||
env_logger.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -207,6 +207,13 @@ struct Args {
|
||||
/// Also defines interval for eviction retries.
|
||||
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
|
||||
eviction_min_resident: Duration,
|
||||
/// Enable fanning out WAL to different shards from the same reader
|
||||
#[arg(long)]
|
||||
wal_reader_fanout: bool,
|
||||
/// Only fan out the WAL reader if the absoulte delta between the new requested position
|
||||
/// and the current position of the reader is smaller than this value.
|
||||
#[arg(long)]
|
||||
max_delta_for_fanout: Option<u64>,
|
||||
}
|
||||
|
||||
// Like PathBufValueParser, but allows empty string.
|
||||
@@ -370,6 +377,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
control_file_save_interval: args.control_file_save_interval,
|
||||
partial_backup_concurrency: args.partial_backup_concurrency,
|
||||
eviction_min_resident: args.eviction_min_resident,
|
||||
wal_reader_fanout: args.wal_reader_fanout,
|
||||
max_delta_for_fanout: args.max_delta_for_fanout,
|
||||
});
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use safekeeper_api::membership::INVALID_GENERATION;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use utils::crashsafe::durable_rename;
|
||||
@@ -13,14 +14,14 @@ use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::control_file_upgrade::downgrade_v9_to_v8;
|
||||
use crate::control_file_upgrade::downgrade_v10_to_v9;
|
||||
use crate::control_file_upgrade::upgrade_control_file;
|
||||
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
|
||||
use crate::state::{EvictionState, TimelinePersistentState};
|
||||
use utils::bin_ser::LeSer;
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 9;
|
||||
pub const SK_FORMAT_VERSION: u32 = 10;
|
||||
|
||||
// contains persistent metadata for safekeeper
|
||||
pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
||||
@@ -169,10 +170,11 @@ impl TimelinePersistentState {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
|
||||
|
||||
if self.eviction_state == EvictionState::Present {
|
||||
// temp hack for forward compatibility
|
||||
const PREV_FORMAT_VERSION: u32 = 8;
|
||||
let prev = downgrade_v9_to_v8(self);
|
||||
if self.mconf.generation == INVALID_GENERATION {
|
||||
// Temp hack for forward compatibility test: in case of none
|
||||
// configuration save cfile in previous v9 format.
|
||||
const PREV_FORMAT_VERSION: u32 = 9;
|
||||
let prev = downgrade_v10_to_v9(self);
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
|
||||
prev.ser_into(&mut buf)?;
|
||||
} else {
|
||||
@@ -233,6 +235,7 @@ impl Storage for FileStorage {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use safekeeper_api::membership::{Configuration, MemberSet};
|
||||
use tokio::fs;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -242,6 +245,11 @@ mod test {
|
||||
async fn test_read_write_safekeeper_state() -> anyhow::Result<()> {
|
||||
let tempdir = camino_tempfile::tempdir()?;
|
||||
let mut state = TimelinePersistentState::empty();
|
||||
state.mconf = Configuration {
|
||||
generation: 42,
|
||||
members: MemberSet::empty(),
|
||||
new_members: None,
|
||||
};
|
||||
let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?;
|
||||
|
||||
// Make a change.
|
||||
|
||||
@@ -1,17 +1,22 @@
|
||||
//! Code to deal with safekeeper control file upgrades
|
||||
use std::vec;
|
||||
|
||||
use crate::{
|
||||
safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn},
|
||||
state::{EvictionState, PersistedPeers, TimelinePersistentState},
|
||||
state::{EvictionState, TimelinePersistentState},
|
||||
wal_backup_partial,
|
||||
};
|
||||
use anyhow::{bail, Result};
|
||||
use pq_proto::SystemId;
|
||||
use safekeeper_api::{ServerInfo, Term};
|
||||
use safekeeper_api::{
|
||||
membership::{Configuration, INVALID_GENERATION},
|
||||
ServerInfo, Term,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::LeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
@@ -233,6 +238,90 @@ pub struct SafeKeeperStateV8 {
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeerInfo {
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Term of the last entry.
|
||||
pub term: Term,
|
||||
/// LSN of the last record.
|
||||
pub flush_lsn: Lsn,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
pub commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl PersistedPeerInfo {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
backup_lsn: Lsn::INVALID,
|
||||
term: safekeeper_api::INITIAL_TERM,
|
||||
flush_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// make clippy happy
|
||||
impl Default for PersistedPeerInfo {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Note: SafekeeperStateVn is old name for TimelinePersistentStateVn.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct TimelinePersistentStateV9 {
|
||||
#[serde(with = "hex")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: TimelineId,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
pub server: ServerInfo,
|
||||
/// Unique id of the last *elected* proposer we dealt with. Not needed
|
||||
/// for correctness, exists for monitoring purposes.
|
||||
#[serde(with = "hex")]
|
||||
pub proposer_uuid: PgUuid,
|
||||
/// Since which LSN this timeline generally starts. Safekeeper might have
|
||||
/// joined later.
|
||||
pub timeline_start_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has (had) WAL for this timeline.
|
||||
/// All WAL segments next to one containing local_start_lsn are
|
||||
/// filled with data from the beginning.
|
||||
pub local_start_lsn: Lsn,
|
||||
/// Part of WAL acknowledged by quorum *and available locally*. Always points
|
||||
/// to record boundary.
|
||||
pub commit_lsn: Lsn,
|
||||
/// LSN that points to the end of the last backed up segment. Useful to
|
||||
/// persist to avoid finding out offloading progress on boot.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||
/// of last record streamed to everyone). Persisting it helps skipping
|
||||
/// recovery in walproposer, generally we compute it from peers. In
|
||||
/// walproposer proto called 'truncate_lsn'. Updates are currently drived
|
||||
/// only by walproposer.
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
/// LSN of the oldest known checkpoint made by pageserver and successfully
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||
/// fundamental; but state is saved here only for informational purposes and
|
||||
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
/// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
/// Eviction state of the timeline. If it's Offloaded, we should download
|
||||
/// WAL files from remote storage to serve the timeline.
|
||||
pub eviction_state: EvictionState,
|
||||
}
|
||||
|
||||
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
|
||||
// migrate to storing full term history
|
||||
if version == 1 {
|
||||
@@ -248,6 +337,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: ac,
|
||||
server: ServerInfo {
|
||||
pg_version: oldstate.server.pg_version,
|
||||
@@ -261,9 +351,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to hexing some ids
|
||||
} else if version == 2 {
|
||||
@@ -277,6 +367,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -286,9 +377,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
|
||||
} else if version == 3 {
|
||||
@@ -302,6 +393,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -311,9 +403,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to having timeline_start_lsn
|
||||
} else if version == 4 {
|
||||
@@ -327,6 +419,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -336,9 +429,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn::INVALID,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 5 {
|
||||
info!("reading safekeeper control file version {}", version);
|
||||
@@ -372,6 +465,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -381,9 +475,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 8 {
|
||||
let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;
|
||||
@@ -391,6 +485,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -400,9 +495,28 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: oldstate.partial_backup,
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 9 {
|
||||
let oldstate = TimelinePersistentStateV9::des(&buf[..buf.len()])?;
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
timeline_start_lsn: oldstate.timeline_start_lsn,
|
||||
local_start_lsn: oldstate.local_start_lsn,
|
||||
commit_lsn: oldstate.commit_lsn,
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
partial_backup: oldstate.partial_backup,
|
||||
eviction_state: oldstate.eviction_state,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -412,9 +526,11 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
bail!("unsupported safekeeper control file version {}", version)
|
||||
}
|
||||
|
||||
pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
|
||||
assert!(state.eviction_state == EvictionState::Present);
|
||||
SafeKeeperStateV8 {
|
||||
// Used as a temp hack to make forward compatibility test work. Should be
|
||||
// removed after PR adding v10 is merged.
|
||||
pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersistentStateV9 {
|
||||
assert!(state.mconf.generation == INVALID_GENERATION);
|
||||
TimelinePersistentStateV9 {
|
||||
tenant_id: state.tenant_id,
|
||||
timeline_id: state.timeline_id,
|
||||
acceptor_state: state.acceptor_state.clone(),
|
||||
@@ -426,8 +542,9 @@ pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8
|
||||
backup_lsn: state.backup_lsn,
|
||||
peer_horizon_lsn: state.peer_horizon_lsn,
|
||||
remote_consistent_lsn: state.remote_consistent_lsn,
|
||||
peers: state.peers.clone(),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: state.partial_backup.clone(),
|
||||
eviction_state: state.eviction_state,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -437,7 +554,7 @@ mod tests {
|
||||
|
||||
use utils::{id::NodeId, Hex};
|
||||
|
||||
use crate::safekeeper::PersistedPeerInfo;
|
||||
use crate::control_file_upgrade::PersistedPeerInfo;
|
||||
|
||||
use super::*;
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use anyhow::{bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use std::sync::Arc;
|
||||
use tokio::{
|
||||
fs::OpenOptions,
|
||||
@@ -147,10 +148,10 @@ pub async fn handle_request(
|
||||
|
||||
let mut new_state = TimelinePersistentState::new(
|
||||
&request.destination_ttid,
|
||||
Configuration::empty(),
|
||||
state.server.clone(),
|
||||
vec![],
|
||||
request.until_lsn,
|
||||
start_lsn,
|
||||
request.until_lsn,
|
||||
)?;
|
||||
new_state.timeline_start_lsn = start_lsn;
|
||||
new_state.peer_horizon_lsn = request.until_lsn;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use safekeeper_api::models;
|
||||
use safekeeper_api::models::AcceptorStateStatus;
|
||||
use safekeeper_api::models::SafekeeperStatus;
|
||||
use safekeeper_api::models::TermSwitchApiEntry;
|
||||
@@ -111,14 +112,15 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
system_id: request_data.system_id.unwrap_or(0),
|
||||
wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32),
|
||||
};
|
||||
let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| {
|
||||
request_data
|
||||
.commit_lsn
|
||||
.segment_lsn(server_info.wal_seg_size as usize)
|
||||
});
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
global_timelines
|
||||
.create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
|
||||
.create(
|
||||
ttid,
|
||||
request_data.mconf,
|
||||
server_info,
|
||||
request_data.start_lsn,
|
||||
request_data.commit_lsn.unwrap_or(request_data.start_lsn),
|
||||
)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -182,6 +184,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let status = TimelineStatus {
|
||||
tenant_id: ttid.tenant_id,
|
||||
timeline_id: ttid.timeline_id,
|
||||
mconf: state.mconf,
|
||||
acceptor_state: acc_state,
|
||||
pg_info: state.server,
|
||||
flush_lsn,
|
||||
@@ -192,7 +195,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||
remote_consistent_lsn: inmem.remote_consistent_lsn,
|
||||
peers: tli.get_peers(conf).await,
|
||||
walsenders: tli.get_walsenders().get_all(),
|
||||
walsenders: tli.get_walsenders().get_all_public(),
|
||||
walreceivers: tli.get_walreceivers().get_all(),
|
||||
};
|
||||
json_response(StatusCode::OK, status)
|
||||
@@ -267,6 +270,28 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Consider switching timeline membership configuration to the provided one.
|
||||
async fn timeline_membership_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?;
|
||||
let response = tli
|
||||
.membership_switch(data.mconf)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
@@ -618,6 +643,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id",
|
||||
|r| request_span(r, timeline_snapshot_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/membership",
|
||||
|r| request_span(r, timeline_membership_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
|
||||
|r| request_span(r, timeline_copy_handler),
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
use anyhow::Context;
|
||||
use postgres_backend::QueryError;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::{ServerInfo, Term};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -105,6 +106,7 @@ async fn prepare_safekeeper(
|
||||
.global_timelines
|
||||
.create(
|
||||
spg.ttid,
|
||||
Configuration::empty(),
|
||||
ServerInfo {
|
||||
pg_version,
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
|
||||
@@ -108,6 +108,8 @@ pub struct SafeKeeperConf {
|
||||
pub control_file_save_interval: Duration,
|
||||
pub partial_backup_concurrency: usize,
|
||||
pub eviction_min_resident: Duration,
|
||||
pub wal_reader_fanout: bool,
|
||||
pub max_delta_for_fanout: Option<u64>,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -150,6 +152,8 @@ impl SafeKeeperConf {
|
||||
control_file_save_interval: Duration::from_secs(1),
|
||||
partial_backup_concurrency: 1,
|
||||
eviction_min_resident: Duration::ZERO,
|
||||
wal_reader_fanout: false,
|
||||
max_delta_for_fanout: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,9 +12,9 @@ use metrics::{
|
||||
pow2_buckets,
|
||||
proto::MetricFamily,
|
||||
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec,
|
||||
Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
|
||||
IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
|
||||
register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
|
||||
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_ffi::XLogSegNo;
|
||||
@@ -211,6 +211,14 @@ pub static WAL_RECEIVERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
)
|
||||
.expect("Failed to register safekeeper_wal_receivers")
|
||||
});
|
||||
pub static WAL_READERS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"safekeeper_wal_readers",
|
||||
"Number of active WAL readers (may serve pageservers or other safekeepers)",
|
||||
&["kind", "target"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_wal_receivers")
|
||||
});
|
||||
pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy<Histogram> = Lazy::new(|| {
|
||||
// Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and
|
||||
// full queues respectively.
|
||||
@@ -443,6 +451,7 @@ pub struct FullTimelineInfo {
|
||||
pub timeline_is_active: bool,
|
||||
pub num_computes: u32,
|
||||
pub last_removed_segno: XLogSegNo,
|
||||
pub interpreted_wal_reader_tasks: usize,
|
||||
|
||||
pub epoch_start_lsn: Lsn,
|
||||
pub mem_state: TimelineMemState,
|
||||
@@ -472,6 +481,7 @@ pub struct TimelineCollector {
|
||||
disk_usage: GenericGaugeVec<AtomicU64>,
|
||||
acceptor_term: GenericGaugeVec<AtomicU64>,
|
||||
written_wal_bytes: GenericGaugeVec<AtomicU64>,
|
||||
interpreted_wal_reader_tasks: GenericGaugeVec<AtomicU64>,
|
||||
written_wal_seconds: GaugeVec,
|
||||
flushed_wal_seconds: GaugeVec,
|
||||
collect_timeline_metrics: Gauge,
|
||||
@@ -670,6 +680,16 @@ impl TimelineCollector {
|
||||
.unwrap();
|
||||
descs.extend(active_timelines_count.desc().into_iter().cloned());
|
||||
|
||||
let interpreted_wal_reader_tasks = GenericGaugeVec::new(
|
||||
Opts::new(
|
||||
"safekeeper_interpreted_wal_reader_tasks",
|
||||
"Number of active interpreted wal reader tasks, grouped by timeline",
|
||||
),
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.unwrap();
|
||||
descs.extend(interpreted_wal_reader_tasks.desc().into_iter().cloned());
|
||||
|
||||
TimelineCollector {
|
||||
global_timelines,
|
||||
descs,
|
||||
@@ -693,6 +713,7 @@ impl TimelineCollector {
|
||||
collect_timeline_metrics,
|
||||
timelines_count,
|
||||
active_timelines_count,
|
||||
interpreted_wal_reader_tasks,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -721,6 +742,7 @@ impl Collector for TimelineCollector {
|
||||
self.disk_usage.reset();
|
||||
self.acceptor_term.reset();
|
||||
self.written_wal_bytes.reset();
|
||||
self.interpreted_wal_reader_tasks.reset();
|
||||
self.written_wal_seconds.reset();
|
||||
self.flushed_wal_seconds.reset();
|
||||
|
||||
@@ -782,6 +804,9 @@ impl Collector for TimelineCollector {
|
||||
self.written_wal_bytes
|
||||
.with_label_values(labels)
|
||||
.set(tli.wal_storage.write_wal_bytes);
|
||||
self.interpreted_wal_reader_tasks
|
||||
.with_label_values(labels)
|
||||
.set(tli.interpreted_wal_reader_tasks as u64);
|
||||
self.written_wal_seconds
|
||||
.with_label_values(labels)
|
||||
.set(tli.wal_storage.write_wal_seconds);
|
||||
@@ -834,6 +859,7 @@ impl Collector for TimelineCollector {
|
||||
mfs.extend(self.disk_usage.collect());
|
||||
mfs.extend(self.acceptor_term.collect());
|
||||
mfs.extend(self.written_wal_bytes.collect());
|
||||
mfs.extend(self.interpreted_wal_reader_tasks.collect());
|
||||
mfs.extend(self.written_wal_seconds.collect());
|
||||
mfs.extend(self.flushed_wal_seconds.collect());
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::PostgresBackendReader;
|
||||
use postgres_backend::QueryError;
|
||||
use pq_proto::BeMessage;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus};
|
||||
use safekeeper_api::ServerInfo;
|
||||
use std::future;
|
||||
@@ -337,7 +338,13 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
};
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.create(
|
||||
self.ttid,
|
||||
Configuration::empty(),
|
||||
server_info,
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await
|
||||
.context("create timeline")?;
|
||||
tli.wal_residence_guard().await?
|
||||
|
||||
@@ -7,7 +7,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
|
||||
use safekeeper_api::models::HotStandbyFeedback;
|
||||
use safekeeper_api::Term;
|
||||
use safekeeper_api::INVALID_TERM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::max;
|
||||
use std::cmp::min;
|
||||
@@ -193,36 +192,6 @@ impl AcceptorState {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeerInfo {
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Term of the last entry.
|
||||
pub term: Term,
|
||||
/// LSN of the last record.
|
||||
pub flush_lsn: Lsn,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
pub commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl PersistedPeerInfo {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
backup_lsn: Lsn::INVALID,
|
||||
term: INVALID_TERM,
|
||||
flush_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// make clippy happy
|
||||
impl Default for PersistedPeerInfo {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
// protocol messages
|
||||
|
||||
/// Initial Proposer -> Acceptor message
|
||||
@@ -1010,7 +979,7 @@ where
|
||||
|
||||
/// Update commit_lsn from peer safekeeper data.
|
||||
pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
|
||||
if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) {
|
||||
if Lsn(sk_info.commit_lsn) != Lsn::INVALID {
|
||||
// Note: the check is too restrictive, generally we can update local
|
||||
// commit_lsn if our history matches (is part of) history of advanced
|
||||
// commit_lsn provider.
|
||||
@@ -1025,12 +994,20 @@ where
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::future::BoxFuture;
|
||||
|
||||
use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE};
|
||||
use safekeeper_api::ServerInfo;
|
||||
use safekeeper_api::{
|
||||
membership::{Configuration, MemberSet, SafekeeperId},
|
||||
ServerInfo,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState};
|
||||
use std::{ops::Deref, str::FromStr, time::Instant};
|
||||
use crate::state::{EvictionState, TimelinePersistentState};
|
||||
use std::{
|
||||
ops::Deref,
|
||||
str::FromStr,
|
||||
time::{Instant, UNIX_EPOCH},
|
||||
};
|
||||
|
||||
// fake storage for tests
|
||||
struct InMemoryState {
|
||||
@@ -1313,12 +1290,21 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_sk_state_bincode_serde_roundtrip() {
|
||||
use utils::Hex;
|
||||
let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
|
||||
let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
|
||||
let state = TimelinePersistentState {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mconf: Configuration {
|
||||
generation: 42,
|
||||
members: MemberSet::new(vec![SafekeeperId {
|
||||
id: NodeId(1),
|
||||
host: "hehe.org".to_owned(),
|
||||
pg_port: 5432,
|
||||
}])
|
||||
.expect("duplicate member"),
|
||||
new_members: None,
|
||||
},
|
||||
acceptor_state: AcceptorState {
|
||||
term: 42,
|
||||
term_history: TermHistory(vec![TermLsn {
|
||||
@@ -1342,70 +1328,13 @@ mod tests {
|
||||
backup_lsn: Lsn(1234567300),
|
||||
peer_horizon_lsn: Lsn(9999999),
|
||||
remote_consistent_lsn: Lsn(1234560000),
|
||||
peers: PersistedPeers(vec![(
|
||||
NodeId(1),
|
||||
PersistedPeerInfo {
|
||||
backup_lsn: Lsn(1234567000),
|
||||
term: 42,
|
||||
flush_lsn: Lsn(1234567800 - 8),
|
||||
commit_lsn: Lsn(1234567600),
|
||||
},
|
||||
)]),
|
||||
partial_backup: crate::wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: UNIX_EPOCH,
|
||||
};
|
||||
|
||||
let ser = state.ser().unwrap();
|
||||
|
||||
#[rustfmt::skip]
|
||||
let expected = [
|
||||
// tenant_id as length prefixed hex
|
||||
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36,
|
||||
// timeline_id as length prefixed hex
|
||||
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34,
|
||||
// term
|
||||
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// length prefix
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// unsure why this order is swapped
|
||||
0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// pg_version
|
||||
0x0e, 0x00, 0x00, 0x00,
|
||||
// systemid
|
||||
0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
|
||||
// wal_seg_size
|
||||
0x78, 0x56, 0x34, 0x12,
|
||||
// pguuid as length prefixed hex
|
||||
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31,
|
||||
|
||||
// timeline_start_lsn
|
||||
0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00,
|
||||
0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
// length prefix for persistentpeers
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// nodeid
|
||||
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// backuplsn
|
||||
0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
// partial_backup
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// eviction_state
|
||||
0x00, 0x00, 0x00, 0x00,
|
||||
];
|
||||
|
||||
assert_eq!(Hex(&ser), Hex(&expected));
|
||||
|
||||
let deser = TimelinePersistentState::des(&ser).unwrap();
|
||||
|
||||
assert_eq!(deser, state);
|
||||
|
||||
@@ -1,96 +1,330 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::future::Either;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend};
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
use postgres_ffi::waldecoder::WalDecodeError;
|
||||
use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder};
|
||||
use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::sync::mpsc::error::SendError;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::MissedTickBehavior;
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::Compression;
|
||||
use utils::postgres_client::InterpretedFormat;
|
||||
use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords};
|
||||
use wal_decoder::wire_format::ToWireFormat;
|
||||
|
||||
use crate::send_wal::EndWatchView;
|
||||
use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder};
|
||||
use crate::metrics::WAL_READERS;
|
||||
use crate::send_wal::{EndWatchView, WalSenderGuard};
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_reader_stream::{StreamingWalReader, WalBytes};
|
||||
|
||||
/// Shard-aware interpreted record sender.
|
||||
/// This is used for sending WAL to the pageserver. Said WAL
|
||||
/// is pre-interpreted and filtered for the shard.
|
||||
pub(crate) struct InterpretedWalSender<'a, IO> {
|
||||
pub(crate) format: InterpretedFormat,
|
||||
pub(crate) compression: Option<Compression>,
|
||||
pub(crate) pgb: &'a mut PostgresBackend<IO>,
|
||||
pub(crate) wal_stream_builder: WalReaderStreamBuilder,
|
||||
pub(crate) end_watch_view: EndWatchView,
|
||||
pub(crate) shard: ShardIdentity,
|
||||
pub(crate) pg_version: u32,
|
||||
pub(crate) appname: Option<String>,
|
||||
/// Identifier used to differentiate between senders of the same
|
||||
/// shard.
|
||||
///
|
||||
/// In the steady state there's only one, but two pageservers may
|
||||
/// temporarily have the same shard attached and attempt to ingest
|
||||
/// WAL for it. See also [`ShardSenderId`].
|
||||
#[derive(Hash, Eq, PartialEq, Copy, Clone)]
|
||||
struct SenderId(u8);
|
||||
|
||||
impl SenderId {
|
||||
fn first() -> Self {
|
||||
SenderId(0)
|
||||
}
|
||||
|
||||
fn next(&self) -> Self {
|
||||
SenderId(self.0.checked_add(1).expect("few senders"))
|
||||
}
|
||||
}
|
||||
|
||||
struct Batch {
|
||||
#[derive(Hash, Eq, PartialEq)]
|
||||
struct ShardSenderId {
|
||||
shard: ShardIdentity,
|
||||
sender_id: SenderId,
|
||||
}
|
||||
|
||||
impl Display for ShardSenderId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}{}", self.sender_id.0, self.shard.shard_slug())
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardSenderId {
|
||||
fn new(shard: ShardIdentity, sender_id: SenderId) -> Self {
|
||||
ShardSenderId { shard, sender_id }
|
||||
}
|
||||
|
||||
fn shard(&self) -> ShardIdentity {
|
||||
self.shard
|
||||
}
|
||||
}
|
||||
|
||||
/// Shard-aware fan-out interpreted record reader.
|
||||
/// Reads WAL from disk, decodes it, intepretets it, and sends
|
||||
/// it to any [`InterpretedWalSender`] connected to it.
|
||||
/// Each [`InterpretedWalSender`] corresponds to one shard
|
||||
/// and gets interpreted records concerning that shard only.
|
||||
pub(crate) struct InterpretedWalReader {
|
||||
wal_stream: StreamingWalReader,
|
||||
shard_senders: HashMap<ShardIdentity, smallvec::SmallVec<[ShardSenderState; 1]>>,
|
||||
shard_notification_rx: Option<tokio::sync::mpsc::UnboundedReceiver<AttachShardNotification>>,
|
||||
state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
/// A handle for [`InterpretedWalReader`] which allows for interacting with it
|
||||
/// when it runs as a separate tokio task.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct InterpretedWalReaderHandle {
|
||||
join_handle: JoinHandle<Result<(), InterpretedWalReaderError>>,
|
||||
state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
|
||||
shard_notification_tx: tokio::sync::mpsc::UnboundedSender<AttachShardNotification>,
|
||||
}
|
||||
|
||||
struct ShardSenderState {
|
||||
sender_id: SenderId,
|
||||
tx: tokio::sync::mpsc::Sender<Batch>,
|
||||
next_record_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// State of [`InterpretedWalReader`] visible outside of the task running it.
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum InterpretedWalReaderState {
|
||||
Running { current_position: Lsn },
|
||||
Done,
|
||||
}
|
||||
|
||||
pub(crate) struct Batch {
|
||||
wal_end_lsn: Lsn,
|
||||
available_wal_end_lsn: Lsn,
|
||||
records: InterpretedWalRecords,
|
||||
}
|
||||
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
/// Send interpreted WAL to a receiver.
|
||||
/// Stops when an error occurs or the receiver is caught up and there's no active compute.
|
||||
///
|
||||
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
|
||||
/// convenience.
|
||||
pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut wal_position = self.wal_stream_builder.start_pos();
|
||||
let mut wal_decoder =
|
||||
WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version);
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum InterpretedWalReaderError {
|
||||
/// Handler initiates the end of streaming.
|
||||
#[error("decode error: {0}")]
|
||||
Decode(#[from] WalDecodeError),
|
||||
#[error("read or interpret error: {0}")]
|
||||
ReadOrInterpret(#[from] anyhow::Error),
|
||||
#[error("wal stream closed")]
|
||||
WalStreamClosed,
|
||||
}
|
||||
|
||||
let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?;
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
impl InterpretedWalReaderState {
|
||||
fn current_position(&self) -> Option<Lsn> {
|
||||
match self {
|
||||
InterpretedWalReaderState::Running {
|
||||
current_position, ..
|
||||
} => Some(*current_position),
|
||||
InterpretedWalReaderState::Done => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
|
||||
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
|
||||
keepalive_ticker.reset();
|
||||
pub(crate) struct AttachShardNotification {
|
||||
shard_id: ShardIdentity,
|
||||
sender: tokio::sync::mpsc::Sender<Batch>,
|
||||
start_pos: Lsn,
|
||||
}
|
||||
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel::<Batch>(2);
|
||||
impl InterpretedWalReader {
|
||||
/// Spawn the reader in a separate tokio task and return a handle
|
||||
pub(crate) fn spawn(
|
||||
wal_stream: StreamingWalReader,
|
||||
start_pos: Lsn,
|
||||
tx: tokio::sync::mpsc::Sender<Batch>,
|
||||
shard: ShardIdentity,
|
||||
pg_version: u32,
|
||||
appname: &Option<String>,
|
||||
) -> InterpretedWalReaderHandle {
|
||||
let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running {
|
||||
current_position: start_pos,
|
||||
}));
|
||||
|
||||
let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let reader = InterpretedWalReader {
|
||||
wal_stream,
|
||||
shard_senders: HashMap::from([(
|
||||
shard,
|
||||
smallvec::smallvec![ShardSenderState {
|
||||
sender_id: SenderId::first(),
|
||||
tx,
|
||||
next_record_lsn: start_pos,
|
||||
}],
|
||||
)]),
|
||||
shard_notification_rx: Some(shard_notification_rx),
|
||||
state: state.clone(),
|
||||
pg_version,
|
||||
};
|
||||
|
||||
let metric = WAL_READERS
|
||||
.get_metric_with_label_values(&["task", appname.as_deref().unwrap_or("safekeeper")])
|
||||
.unwrap();
|
||||
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
metric.inc();
|
||||
scopeguard::defer! {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
let res = reader.run_impl(start_pos).await;
|
||||
if let Err(ref err) = res {
|
||||
tracing::error!("Task finished with error: {err}");
|
||||
}
|
||||
res
|
||||
}
|
||||
.instrument(info_span!("interpreted wal reader")),
|
||||
);
|
||||
|
||||
InterpretedWalReaderHandle {
|
||||
join_handle,
|
||||
state,
|
||||
shard_notification_tx,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct the reader without spawning anything
|
||||
/// Callers should drive the future returned by [`Self::run`].
|
||||
pub(crate) fn new(
|
||||
wal_stream: StreamingWalReader,
|
||||
start_pos: Lsn,
|
||||
tx: tokio::sync::mpsc::Sender<Batch>,
|
||||
shard: ShardIdentity,
|
||||
pg_version: u32,
|
||||
) -> InterpretedWalReader {
|
||||
let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running {
|
||||
current_position: start_pos,
|
||||
}));
|
||||
|
||||
InterpretedWalReader {
|
||||
wal_stream,
|
||||
shard_senders: HashMap::from([(
|
||||
shard,
|
||||
smallvec::smallvec![ShardSenderState {
|
||||
sender_id: SenderId::first(),
|
||||
tx,
|
||||
next_record_lsn: start_pos,
|
||||
}],
|
||||
)]),
|
||||
shard_notification_rx: None,
|
||||
state: state.clone(),
|
||||
pg_version,
|
||||
}
|
||||
}
|
||||
|
||||
/// Entry point for future (polling) based wal reader.
|
||||
pub(crate) async fn run(
|
||||
self,
|
||||
start_pos: Lsn,
|
||||
appname: &Option<String>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let metric = WAL_READERS
|
||||
.get_metric_with_label_values(&["future", appname.as_deref().unwrap_or("safekeeper")])
|
||||
.unwrap();
|
||||
|
||||
metric.inc();
|
||||
scopeguard::defer! {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
let res = self.run_impl(start_pos).await;
|
||||
if let Err(err) = res {
|
||||
tracing::error!("Interpreted wal reader encountered error: {err}");
|
||||
} else {
|
||||
tracing::info!("Interpreted wal reader exiting");
|
||||
}
|
||||
|
||||
Err(CopyStreamHandlerEnd::Other(anyhow!(
|
||||
"interpreted wal reader finished"
|
||||
)))
|
||||
}
|
||||
|
||||
/// Send interpreted WAL to one or more [`InterpretedWalSender`]s
|
||||
/// Stops when an error is encountered or when the [`InterpretedWalReaderHandle`]
|
||||
/// goes out of scope.
|
||||
async fn run_impl(mut self, start_pos: Lsn) -> Result<(), InterpretedWalReaderError> {
|
||||
let defer_state = self.state.clone();
|
||||
scopeguard::defer! {
|
||||
*defer_state.write().unwrap() = InterpretedWalReaderState::Done;
|
||||
}
|
||||
|
||||
let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Get some WAL from the stream and then: decode, interpret and push it down the
|
||||
// pipeline.
|
||||
wal = stream.next(), if tx.capacity() > 0 => {
|
||||
let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal {
|
||||
Some(some) => some?,
|
||||
None => { break; }
|
||||
// Main branch for reading WAL and forwarding it
|
||||
wal_or_reset = self.wal_stream.next() => {
|
||||
let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below"));
|
||||
let WalBytes {
|
||||
wal,
|
||||
wal_start_lsn: _,
|
||||
wal_end_lsn,
|
||||
available_wal_end_lsn,
|
||||
} = match wal {
|
||||
Some(some) => some.map_err(InterpretedWalReaderError::ReadOrInterpret)?,
|
||||
None => {
|
||||
// [`StreamingWalReader::next`] is an endless stream of WAL.
|
||||
// It shouldn't ever finish unless it panicked or became internally
|
||||
// inconsistent.
|
||||
return Result::Err(InterpretedWalReaderError::WalStreamClosed);
|
||||
}
|
||||
};
|
||||
|
||||
wal_position = wal_end_lsn;
|
||||
wal_decoder.feed_bytes(&wal);
|
||||
|
||||
let mut records = Vec::new();
|
||||
// Deserialize and interpret WAL records from this batch of WAL.
|
||||
// Interpreted records for each shard are collected separately.
|
||||
let shard_ids = self.shard_senders.keys().copied().collect::<Vec<_>>();
|
||||
let mut records_by_sender: HashMap<ShardSenderId, Vec<InterpretedWalRecord>> = HashMap::new();
|
||||
let mut max_next_record_lsn = None;
|
||||
while let Some((next_record_lsn, recdata)) = wal_decoder
|
||||
.poll_decode()
|
||||
.with_context(|| "Failed to decode WAL")?
|
||||
while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()?
|
||||
{
|
||||
assert!(next_record_lsn.is_aligned());
|
||||
max_next_record_lsn = Some(next_record_lsn);
|
||||
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&self.shard,
|
||||
&shard_ids,
|
||||
next_record_lsn,
|
||||
self.pg_version,
|
||||
)
|
||||
.with_context(|| "Failed to interpret WAL")?;
|
||||
|
||||
if !interpreted.is_empty() {
|
||||
records.push(interpreted);
|
||||
for (shard, record) in interpreted {
|
||||
if record.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut states_iter = self.shard_senders
|
||||
.get(&shard)
|
||||
.expect("keys collected above")
|
||||
.iter()
|
||||
.filter(|state| record.next_record_lsn > state.next_record_lsn)
|
||||
.peekable();
|
||||
while let Some(state) = states_iter.next() {
|
||||
let shard_sender_id = ShardSenderId::new(shard, state.sender_id);
|
||||
|
||||
// The most commont case is one sender per shard. Peek and break to avoid the
|
||||
// clone in that situation.
|
||||
if states_iter.peek().is_none() {
|
||||
records_by_sender.entry(shard_sender_id).or_default().push(record);
|
||||
break;
|
||||
} else {
|
||||
records_by_sender.entry(shard_sender_id).or_default().push(record.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,20 +333,170 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
None => { continue; }
|
||||
};
|
||||
|
||||
let batch = InterpretedWalRecords {
|
||||
records,
|
||||
next_record_lsn: Some(max_next_record_lsn),
|
||||
};
|
||||
// Update the current position such that new receivers can decide
|
||||
// whether to attach to us or spawn a new WAL reader.
|
||||
match &mut *self.state.write().unwrap() {
|
||||
InterpretedWalReaderState::Running { current_position, .. } => {
|
||||
*current_position = max_next_record_lsn;
|
||||
},
|
||||
InterpretedWalReaderState::Done => {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap();
|
||||
// Send interpreted records downstream. Anything that has already been seen
|
||||
// by a shard is filtered out.
|
||||
let mut shard_senders_to_remove = Vec::new();
|
||||
for (shard, states) in &mut self.shard_senders {
|
||||
for state in states {
|
||||
if max_next_record_lsn <= state.next_record_lsn {
|
||||
continue;
|
||||
}
|
||||
|
||||
let shard_sender_id = ShardSenderId::new(*shard, state.sender_id);
|
||||
let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default();
|
||||
|
||||
let batch = InterpretedWalRecords {
|
||||
records,
|
||||
next_record_lsn: Some(max_next_record_lsn),
|
||||
};
|
||||
|
||||
let res = state.tx.send(Batch {
|
||||
wal_end_lsn,
|
||||
available_wal_end_lsn,
|
||||
records: batch,
|
||||
}).await;
|
||||
|
||||
if res.is_err() {
|
||||
shard_senders_to_remove.push(shard_sender_id);
|
||||
} else {
|
||||
state.next_record_lsn = max_next_record_lsn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up any shard senders that have dropped out.
|
||||
// This is inefficient, but such events are rare (connection to PS termination)
|
||||
// and the number of subscriptions on the same shards very small (only one
|
||||
// for the steady state).
|
||||
for to_remove in shard_senders_to_remove {
|
||||
let shard_senders = self.shard_senders.get_mut(&to_remove.shard()).expect("saw it above");
|
||||
if let Some(idx) = shard_senders.iter().position(|s| s.sender_id == to_remove.sender_id) {
|
||||
shard_senders.remove(idx);
|
||||
tracing::info!("Removed shard sender {}", to_remove);
|
||||
}
|
||||
|
||||
if shard_senders.is_empty() {
|
||||
self.shard_senders.remove(&to_remove.shard());
|
||||
}
|
||||
}
|
||||
},
|
||||
// For a previously interpreted batch, serialize it and push it down the wire.
|
||||
batch = rx.recv() => {
|
||||
// Listen for new shards that want to attach to this reader.
|
||||
// If the reader is not running as a task, then this is not supported
|
||||
// (see the pending branch below).
|
||||
notification = match self.shard_notification_rx.as_mut() {
|
||||
Some(rx) => Either::Left(rx.recv()),
|
||||
None => Either::Right(std::future::pending())
|
||||
} => {
|
||||
if let Some(n) = notification {
|
||||
let AttachShardNotification { shard_id, sender, start_pos } = n;
|
||||
|
||||
// Update internal and external state, then reset the WAL stream
|
||||
// if required.
|
||||
let senders = self.shard_senders.entry(shard_id).or_default();
|
||||
let new_sender_id = match senders.last() {
|
||||
Some(sender) => sender.sender_id.next(),
|
||||
None => SenderId::first()
|
||||
};
|
||||
|
||||
senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos});
|
||||
let current_pos = self.state.read().unwrap().current_position().unwrap();
|
||||
if start_pos < current_pos {
|
||||
self.wal_stream.reset(start_pos).await;
|
||||
wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"Added shard sender {} with start_pos={} current_pos={}",
|
||||
ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl InterpretedWalReaderHandle {
|
||||
/// Fan-out the reader by attaching a new shard to it
|
||||
pub(crate) fn fanout(
|
||||
&self,
|
||||
shard_id: ShardIdentity,
|
||||
sender: tokio::sync::mpsc::Sender<Batch>,
|
||||
start_pos: Lsn,
|
||||
) -> Result<(), SendError<AttachShardNotification>> {
|
||||
self.shard_notification_tx.send(AttachShardNotification {
|
||||
shard_id,
|
||||
sender,
|
||||
start_pos,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the current WAL position of the reader
|
||||
pub(crate) fn current_position(&self) -> Option<Lsn> {
|
||||
self.state.read().unwrap().current_position()
|
||||
}
|
||||
|
||||
pub(crate) fn abort(&self) {
|
||||
self.join_handle.abort()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for InterpretedWalReaderHandle {
|
||||
fn drop(&mut self) {
|
||||
tracing::info!("Aborting interpreted wal reader");
|
||||
self.abort()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct InterpretedWalSender<'a, IO> {
|
||||
pub(crate) format: InterpretedFormat,
|
||||
pub(crate) compression: Option<Compression>,
|
||||
pub(crate) appname: Option<String>,
|
||||
|
||||
pub(crate) tli: WalResidentTimeline,
|
||||
pub(crate) start_lsn: Lsn,
|
||||
|
||||
pub(crate) pgb: &'a mut PostgresBackend<IO>,
|
||||
pub(crate) end_watch_view: EndWatchView,
|
||||
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
|
||||
pub(crate) rx: tokio::sync::mpsc::Receiver<Batch>,
|
||||
}
|
||||
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
/// Send interpreted WAL records over the network.
|
||||
/// Also manages keep-alives if nothing was sent for a while.
|
||||
pub(crate) async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
|
||||
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
|
||||
keepalive_ticker.reset();
|
||||
|
||||
let mut wal_position = self.start_lsn;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
batch = self.rx.recv() => {
|
||||
let batch = match batch {
|
||||
Some(b) => b,
|
||||
None => { break; }
|
||||
None => {
|
||||
return Result::Err(
|
||||
CopyStreamHandlerEnd::Other(anyhow!("Interpreted WAL reader exited early"))
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
wal_position = batch.wal_end_lsn;
|
||||
|
||||
let buf = batch
|
||||
.records
|
||||
.to_wire(self.format, self.compression)
|
||||
@@ -132,7 +516,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
})).await?;
|
||||
}
|
||||
// Send a periodic keep alive when the connection has been idle for a while.
|
||||
// Since we've been idle, also check if we can stop streaming.
|
||||
_ = keepalive_ticker.tick() => {
|
||||
if let Some(remote_consistent_lsn) = self.wal_sender_guard
|
||||
.walsenders()
|
||||
.get_ws_remote_consistent_lsn(self.wal_sender_guard.id())
|
||||
{
|
||||
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
|
||||
// Stop streaming if the receivers are caught up and
|
||||
// there's no active compute. This causes the loop in
|
||||
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
|
||||
// to exit and terminate the WAL stream.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
self.pgb
|
||||
.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
|
||||
wal_end: self.end_watch_view.get().0,
|
||||
@@ -140,14 +538,259 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
request_reply: true,
|
||||
}))
|
||||
.await?;
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// The loop above ends when the receiver is caught up and there's no more WAL to send.
|
||||
Err(CopyStreamHandlerEnd::ServerInitiated(format!(
|
||||
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
|
||||
self.appname, wal_position,
|
||||
)))
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
use tokio::sync::mpsc::error::TryRecvError;
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
send_interpreted_wal::{Batch, InterpretedWalReader},
|
||||
test_utils::Env,
|
||||
wal_reader_stream::StreamingWalReader,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_interpreted_wal_reader_fanout() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 200;
|
||||
const PG_VERSION: u32 = 17;
|
||||
const SHARD_COUNT: u8 = 2;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
tracing::info!("Doing first round of reads ...");
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let shard_0 = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let shard_1 = ShardIdentity::new(
|
||||
ShardNumber(1),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut shards = HashMap::new();
|
||||
|
||||
for shard_number in 0..SHARD_COUNT {
|
||||
let shard_id = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
shards.insert(shard_id, (Some(tx), Some(rx)));
|
||||
}
|
||||
|
||||
let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap();
|
||||
let mut shard_0_rx = shards.get_mut(&shard_0).unwrap().1.take().unwrap();
|
||||
|
||||
let handle = InterpretedWalReader::spawn(
|
||||
streaming_wal_reader,
|
||||
start_lsn,
|
||||
shard_0_tx,
|
||||
shard_0,
|
||||
PG_VERSION,
|
||||
&Some("pageserver".to_string()),
|
||||
);
|
||||
|
||||
tracing::info!("Reading all WAL with only shard 0 attached ...");
|
||||
|
||||
let mut shard_0_interpreted_records = Vec::new();
|
||||
while let Some(batch) = shard_0_rx.recv().await {
|
||||
shard_0_interpreted_records.push(batch.records);
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let shard_1_tx = shards.get_mut(&shard_1).unwrap().0.take().unwrap();
|
||||
let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap();
|
||||
|
||||
tracing::info!("Attaching shard 1 to the reader at start of WAL");
|
||||
handle.fanout(shard_1, shard_1_tx, start_lsn).unwrap();
|
||||
|
||||
tracing::info!("Reading all WAL with shard 0 and shard 1 attached ...");
|
||||
|
||||
let mut shard_1_interpreted_records = Vec::new();
|
||||
while let Some(batch) = shard_1_rx.recv().await {
|
||||
shard_1_interpreted_records.push(batch.records);
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// This test uses logical messages. Those only go to shard 0. Check that the
|
||||
// filtering worked and shard 1 did not get any.
|
||||
assert!(shard_1_interpreted_records
|
||||
.iter()
|
||||
.all(|recs| recs.records.is_empty()));
|
||||
|
||||
// Shard 0 should not receive anything more since the reader is
|
||||
// going through wal that it has already processed.
|
||||
let res = shard_0_rx.try_recv();
|
||||
if let Ok(ref ok) = res {
|
||||
tracing::error!(
|
||||
"Shard 0 received batch: wal_end_lsn={} available_wal_end_lsn={}",
|
||||
ok.wal_end_lsn,
|
||||
ok.available_wal_end_lsn
|
||||
);
|
||||
}
|
||||
assert!(matches!(res, Err(TryRecvError::Empty)));
|
||||
|
||||
// Check that the next records lsns received by the two shards match up.
|
||||
let shard_0_next_lsns = shard_0_interpreted_records
|
||||
.iter()
|
||||
.map(|recs| recs.next_record_lsn)
|
||||
.collect::<Vec<_>>();
|
||||
let shard_1_next_lsns = shard_1_interpreted_records
|
||||
.iter()
|
||||
.map(|recs| recs.next_record_lsn)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(shard_0_next_lsns, shard_1_next_lsns);
|
||||
|
||||
handle.abort();
|
||||
let mut done = false;
|
||||
for _ in 0..5 {
|
||||
if handle.current_position().is_none() {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await;
|
||||
}
|
||||
|
||||
assert!(done);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_interpreted_wal_reader_same_shard_fanout() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 200;
|
||||
const PG_VERSION: u32 = 17;
|
||||
const SHARD_COUNT: u8 = 2;
|
||||
const ATTACHED_SHARDS: u8 = 4;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let shard_0 = ShardIdentity::new(
|
||||
ShardNumber(0),
|
||||
ShardCount(SHARD_COUNT),
|
||||
ShardStripeSize::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
let mut batch_receivers = vec![rx];
|
||||
|
||||
let handle = InterpretedWalReader::spawn(
|
||||
streaming_wal_reader,
|
||||
start_lsn,
|
||||
tx,
|
||||
shard_0,
|
||||
PG_VERSION,
|
||||
&Some("pageserver".to_string()),
|
||||
);
|
||||
|
||||
for _ in 0..(ATTACHED_SHARDS - 1) {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
handle.fanout(shard_0, tx, start_lsn).unwrap();
|
||||
batch_receivers.push(rx);
|
||||
}
|
||||
|
||||
loop {
|
||||
let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap();
|
||||
for rx in batch_receivers.iter_mut().skip(1) {
|
||||
let other_batch = rx.recv().await.unwrap();
|
||||
|
||||
assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn);
|
||||
assert_eq!(
|
||||
batch.available_wal_end_lsn,
|
||||
other_batch.available_wal_end_lsn
|
||||
);
|
||||
}
|
||||
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
handle.abort();
|
||||
let mut done = false;
|
||||
for _ in 0..5 {
|
||||
if handle.current_position().is_none() {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await;
|
||||
}
|
||||
|
||||
assert!(done);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,16 +2,18 @@
|
||||
//! with the "START_REPLICATION" message, and registry of walsenders.
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::metrics::RECEIVED_PS_FEEDBACKS;
|
||||
use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS};
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::TermLsn;
|
||||
use crate::send_interpreted_wal::InterpretedWalSender;
|
||||
use crate::send_interpreted_wal::{
|
||||
Batch, InterpretedWalReader, InterpretedWalReaderHandle, InterpretedWalSender,
|
||||
};
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_reader_stream::WalReaderStreamBuilder;
|
||||
use crate::wal_reader_stream::StreamingWalReader;
|
||||
use crate::wal_storage::WalReader;
|
||||
use anyhow::{bail, Context as AnyhowContext};
|
||||
use bytes::Bytes;
|
||||
use futures::future::Either;
|
||||
use futures::FutureExt;
|
||||
use parking_lot::Mutex;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
|
||||
@@ -19,16 +21,16 @@ use postgres_ffi::get_current_timestamp;
|
||||
use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
|
||||
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
|
||||
use safekeeper_api::models::{
|
||||
ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
|
||||
WalSenderState, INVALID_FULL_TRANSACTION_ID,
|
||||
HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply,
|
||||
INVALID_FULL_TRANSACTION_ID,
|
||||
};
|
||||
use safekeeper_api::Term;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use utils::failpoint_support;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use itertools::Itertools;
|
||||
use std::cmp::{max, min};
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
@@ -50,6 +52,12 @@ pub struct WalSenders {
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
}
|
||||
|
||||
pub struct WalSendersTimelineMetricValues {
|
||||
pub ps_feedback_counter: u64,
|
||||
pub last_ps_feedback: PageserverFeedback,
|
||||
pub interpreted_wal_reader_tasks: usize,
|
||||
}
|
||||
|
||||
impl WalSenders {
|
||||
pub fn new(walreceivers: Arc<WalReceivers>) -> Arc<WalSenders> {
|
||||
Arc::new(WalSenders {
|
||||
@@ -60,21 +68,8 @@ impl WalSenders {
|
||||
|
||||
/// Register new walsender. Returned guard provides access to the slot and
|
||||
/// automatically deregisters in Drop.
|
||||
fn register(
|
||||
self: &Arc<WalSenders>,
|
||||
ttid: TenantTimelineId,
|
||||
addr: SocketAddr,
|
||||
conn_id: ConnectionId,
|
||||
appname: Option<String>,
|
||||
) -> WalSenderGuard {
|
||||
fn register(self: &Arc<WalSenders>, walsender_state: WalSenderState) -> WalSenderGuard {
|
||||
let slots = &mut self.mutex.lock().slots;
|
||||
let walsender_state = WalSenderState {
|
||||
ttid,
|
||||
addr,
|
||||
conn_id,
|
||||
appname,
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
};
|
||||
// find empty slot or create new one
|
||||
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
|
||||
slots[pos] = Some(walsender_state);
|
||||
@@ -90,9 +85,79 @@ impl WalSenders {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_or_update_interpreted_reader<
|
||||
FUp: FnOnce(&Arc<InterpretedWalReaderHandle>) -> anyhow::Result<()>,
|
||||
FNew: FnOnce() -> InterpretedWalReaderHandle,
|
||||
>(
|
||||
self: &Arc<WalSenders>,
|
||||
id: WalSenderId,
|
||||
start_pos: Lsn,
|
||||
max_delta_for_fanout: Option<u64>,
|
||||
update: FUp,
|
||||
create: FNew,
|
||||
) -> anyhow::Result<()> {
|
||||
let state = &mut self.mutex.lock();
|
||||
|
||||
let mut selected_interpreted_reader = None;
|
||||
for slot in state.slots.iter().flatten() {
|
||||
if let WalSenderState::Interpreted(slot_state) = slot {
|
||||
if let Some(ref interpreted_reader) = slot_state.interpreted_wal_reader {
|
||||
let select = match (interpreted_reader.current_position(), max_delta_for_fanout)
|
||||
{
|
||||
(Some(pos), Some(max_delta)) => {
|
||||
let delta = pos.0.abs_diff(start_pos.0);
|
||||
delta <= max_delta
|
||||
}
|
||||
// Reader is not active
|
||||
(None, _) => false,
|
||||
// Gating fanout by max delta is disabled.
|
||||
// Attach to any active reader.
|
||||
(_, None) => true,
|
||||
};
|
||||
|
||||
if select {
|
||||
selected_interpreted_reader = Some(interpreted_reader.clone());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let slot = state.get_slot_mut(id);
|
||||
let slot_state = match slot {
|
||||
WalSenderState::Interpreted(s) => s,
|
||||
WalSenderState::Vanilla(_) => unreachable!(),
|
||||
};
|
||||
|
||||
let selected_or_new = match selected_interpreted_reader {
|
||||
Some(selected) => {
|
||||
update(&selected)?;
|
||||
selected
|
||||
}
|
||||
None => Arc::new(create()),
|
||||
};
|
||||
|
||||
slot_state.interpreted_wal_reader = Some(selected_or_new);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get state of all walsenders.
|
||||
pub fn get_all(self: &Arc<WalSenders>) -> Vec<WalSenderState> {
|
||||
self.mutex.lock().slots.iter().flatten().cloned().collect()
|
||||
pub fn get_all_public(self: &Arc<WalSenders>) -> Vec<safekeeper_api::models::WalSenderState> {
|
||||
self.mutex
|
||||
.lock()
|
||||
.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|state| match state {
|
||||
WalSenderState::Vanilla(s) => {
|
||||
safekeeper_api::models::WalSenderState::Vanilla(s.clone())
|
||||
}
|
||||
WalSenderState::Interpreted(s) => {
|
||||
safekeeper_api::models::WalSenderState::Interpreted(s.public_state.clone())
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get LSN of the most lagging pageserver receiver. Return None if there are no
|
||||
@@ -103,7 +168,7 @@ impl WalSenders {
|
||||
.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
.filter_map(|s| match s.feedback {
|
||||
.filter_map(|s| match s.get_feedback() {
|
||||
ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn),
|
||||
ReplicationFeedback::Standby(_) => None,
|
||||
})
|
||||
@@ -111,9 +176,25 @@ impl WalSenders {
|
||||
}
|
||||
|
||||
/// Returns total counter of pageserver feedbacks received and last feedback.
|
||||
pub fn get_ps_feedback_stats(self: &Arc<WalSenders>) -> (u64, PageserverFeedback) {
|
||||
pub fn info_for_metrics(self: &Arc<WalSenders>) -> WalSendersTimelineMetricValues {
|
||||
let shared = self.mutex.lock();
|
||||
(shared.ps_feedback_counter, shared.last_ps_feedback)
|
||||
|
||||
let interpreted_wal_reader_tasks = shared
|
||||
.slots
|
||||
.iter()
|
||||
.filter_map(|ss| match ss {
|
||||
Some(WalSenderState::Interpreted(int)) => int.interpreted_wal_reader.as_ref(),
|
||||
Some(WalSenderState::Vanilla(_)) => None,
|
||||
None => None,
|
||||
})
|
||||
.unique_by(|reader| Arc::as_ptr(reader))
|
||||
.count();
|
||||
|
||||
WalSendersTimelineMetricValues {
|
||||
ps_feedback_counter: shared.ps_feedback_counter,
|
||||
last_ps_feedback: shared.last_ps_feedback,
|
||||
interpreted_wal_reader_tasks,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get aggregated hot standby feedback (we send it to compute).
|
||||
@@ -124,7 +205,7 @@ impl WalSenders {
|
||||
/// Record new pageserver feedback, update aggregated values.
|
||||
fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
|
||||
*shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback);
|
||||
shared.last_ps_feedback = *feedback;
|
||||
shared.ps_feedback_counter += 1;
|
||||
drop(shared);
|
||||
@@ -143,10 +224,10 @@ impl WalSenders {
|
||||
"Record standby reply: ts={} apply_lsn={}",
|
||||
reply.reply_ts, reply.apply_lsn
|
||||
);
|
||||
match &mut slot.feedback {
|
||||
match &mut slot.get_mut_feedback() {
|
||||
ReplicationFeedback::Standby(sf) => sf.reply = *reply,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
*slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: *reply,
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
})
|
||||
@@ -158,10 +239,10 @@ impl WalSenders {
|
||||
fn record_hs_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &HotStandbyFeedback) {
|
||||
let mut shared = self.mutex.lock();
|
||||
let slot = shared.get_slot_mut(id);
|
||||
match &mut slot.feedback {
|
||||
match &mut slot.get_mut_feedback() {
|
||||
ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback,
|
||||
ReplicationFeedback::Pageserver(_) => {
|
||||
slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
*slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
|
||||
reply: StandbyReply::empty(),
|
||||
hs_feedback: *feedback,
|
||||
})
|
||||
@@ -175,7 +256,7 @@ impl WalSenders {
|
||||
pub fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
|
||||
let shared = self.mutex.lock();
|
||||
let slot = shared.get_slot(id);
|
||||
match slot.feedback {
|
||||
match slot.get_feedback() {
|
||||
ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
|
||||
_ => None,
|
||||
}
|
||||
@@ -199,6 +280,47 @@ struct WalSendersShared {
|
||||
slots: Vec<Option<WalSenderState>>,
|
||||
}
|
||||
|
||||
/// Safekeeper internal definitions of wal sender state
|
||||
///
|
||||
/// As opposed to [`safekeeper_api::models::WalSenderState`] these struct may
|
||||
/// include state that we don not wish to expose to the public api.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) enum WalSenderState {
|
||||
Vanilla(VanillaWalSenderInternalState),
|
||||
Interpreted(InterpretedWalSenderInternalState),
|
||||
}
|
||||
|
||||
type VanillaWalSenderInternalState = safekeeper_api::models::VanillaWalSenderState;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct InterpretedWalSenderInternalState {
|
||||
public_state: safekeeper_api::models::InterpretedWalSenderState,
|
||||
interpreted_wal_reader: Option<Arc<InterpretedWalReaderHandle>>,
|
||||
}
|
||||
|
||||
impl WalSenderState {
|
||||
fn get_addr(&self) -> &SocketAddr {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &state.addr,
|
||||
WalSenderState::Interpreted(state) => &state.public_state.addr,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_feedback(&self) -> &ReplicationFeedback {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &state.feedback,
|
||||
WalSenderState::Interpreted(state) => &state.public_state.feedback,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mut_feedback(&mut self) -> &mut ReplicationFeedback {
|
||||
match self {
|
||||
WalSenderState::Vanilla(state) => &mut state.feedback,
|
||||
WalSenderState::Interpreted(state) => &mut state.public_state.feedback,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl WalSendersShared {
|
||||
fn new() -> Self {
|
||||
WalSendersShared {
|
||||
@@ -225,7 +347,7 @@ impl WalSendersShared {
|
||||
let mut agg = HotStandbyFeedback::empty();
|
||||
let mut reply_agg = StandbyReply::empty();
|
||||
for ws_state in self.slots.iter().flatten() {
|
||||
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
|
||||
if let ReplicationFeedback::Standby(standby_feedback) = ws_state.get_feedback() {
|
||||
let hs_feedback = standby_feedback.hs_feedback;
|
||||
// doing Option math like op1.iter().chain(op2.iter()).min()
|
||||
// would be nicer, but we serialize/deserialize this struct
|
||||
@@ -317,7 +439,7 @@ impl SafekeeperPostgresHandler {
|
||||
/// Wrapper around handle_start_replication_guts handling result. Error is
|
||||
/// handled here while we're still in walsender ttid span; with API
|
||||
/// extension, this can probably be moved into postgres_backend.
|
||||
pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pub async fn handle_start_replication<IO: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
@@ -342,7 +464,7 @@ impl SafekeeperPostgresHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pub async fn handle_start_replication_guts<IO: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
start_pos: Lsn,
|
||||
@@ -352,12 +474,30 @@ impl SafekeeperPostgresHandler {
|
||||
let appname = self.appname.clone();
|
||||
|
||||
// Use a guard object to remove our entry from the timeline when we are done.
|
||||
let ws_guard = Arc::new(tli.get_walsenders().register(
|
||||
self.ttid,
|
||||
*pgb.get_peer_addr(),
|
||||
self.conn_id,
|
||||
self.appname.clone(),
|
||||
));
|
||||
let ws_guard = match self.protocol() {
|
||||
PostgresClientProtocol::Vanilla => Arc::new(tli.get_walsenders().register(
|
||||
WalSenderState::Vanilla(VanillaWalSenderInternalState {
|
||||
ttid: self.ttid,
|
||||
addr: *pgb.get_peer_addr(),
|
||||
conn_id: self.conn_id,
|
||||
appname: self.appname.clone(),
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
}),
|
||||
)),
|
||||
PostgresClientProtocol::Interpreted { .. } => Arc::new(tli.get_walsenders().register(
|
||||
WalSenderState::Interpreted(InterpretedWalSenderInternalState {
|
||||
public_state: safekeeper_api::models::InterpretedWalSenderState {
|
||||
ttid: self.ttid,
|
||||
shard: self.shard.unwrap(),
|
||||
addr: *pgb.get_peer_addr(),
|
||||
conn_id: self.conn_id,
|
||||
appname: self.appname.clone(),
|
||||
feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()),
|
||||
},
|
||||
interpreted_wal_reader: None,
|
||||
}),
|
||||
)),
|
||||
};
|
||||
|
||||
// Walsender can operate in one of two modes which we select by
|
||||
// application_name: give only committed WAL (used by pageserver) or all
|
||||
@@ -403,7 +543,7 @@ impl SafekeeperPostgresHandler {
|
||||
pgb,
|
||||
// should succeed since we're already holding another guard
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
appname,
|
||||
appname: appname.clone(),
|
||||
start_pos,
|
||||
end_pos,
|
||||
term,
|
||||
@@ -413,7 +553,7 @@ impl SafekeeperPostgresHandler {
|
||||
send_buf: vec![0u8; MAX_SEND_SIZE],
|
||||
};
|
||||
|
||||
Either::Left(sender.run())
|
||||
FutureExt::boxed(sender.run())
|
||||
}
|
||||
PostgresClientProtocol::Interpreted {
|
||||
format,
|
||||
@@ -421,27 +561,96 @@ impl SafekeeperPostgresHandler {
|
||||
} => {
|
||||
let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000;
|
||||
let end_watch_view = end_watch.view();
|
||||
let wal_stream_builder = WalReaderStreamBuilder {
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_pos,
|
||||
end_pos,
|
||||
term,
|
||||
end_watch,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
};
|
||||
let wal_residence_guard = tli.wal_residence_guard().await?;
|
||||
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(2);
|
||||
let shard = self.shard.unwrap();
|
||||
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
pgb,
|
||||
wal_stream_builder,
|
||||
end_watch_view,
|
||||
shard: self.shard.unwrap(),
|
||||
pg_version,
|
||||
appname,
|
||||
};
|
||||
if self.conf.wal_reader_fanout && !shard.is_unsharded() {
|
||||
let ws_id = ws_guard.id();
|
||||
ws_guard.walsenders().create_or_update_interpreted_reader(
|
||||
ws_id,
|
||||
start_pos,
|
||||
self.conf.max_delta_for_fanout,
|
||||
{
|
||||
let tx = tx.clone();
|
||||
|reader| {
|
||||
tracing::info!(
|
||||
"Fanning out interpreted wal reader at {}",
|
||||
start_pos
|
||||
);
|
||||
reader
|
||||
.fanout(shard, tx, start_pos)
|
||||
.with_context(|| "Failed to fan out reader")
|
||||
}
|
||||
},
|
||||
|| {
|
||||
tracing::info!("Spawning interpreted wal reader at {}", start_pos);
|
||||
|
||||
Either::Right(sender.run())
|
||||
let wal_stream = StreamingWalReader::new(
|
||||
wal_residence_guard,
|
||||
term,
|
||||
start_pos,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
InterpretedWalReader::spawn(
|
||||
wal_stream, start_pos, tx, shard, pg_version, &appname,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
appname,
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_lsn: start_pos,
|
||||
pgb,
|
||||
end_watch_view,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
rx,
|
||||
};
|
||||
|
||||
FutureExt::boxed(sender.run())
|
||||
} else {
|
||||
let wal_reader = StreamingWalReader::new(
|
||||
wal_residence_guard,
|
||||
term,
|
||||
start_pos,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let reader =
|
||||
InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version);
|
||||
|
||||
let sender = InterpretedWalSender {
|
||||
format,
|
||||
compression,
|
||||
appname: appname.clone(),
|
||||
tli: tli.wal_residence_guard().await?,
|
||||
start_lsn: start_pos,
|
||||
pgb,
|
||||
end_watch_view,
|
||||
wal_sender_guard: ws_guard.clone(),
|
||||
rx,
|
||||
};
|
||||
|
||||
FutureExt::boxed(async move {
|
||||
// Sender returns an Err on all code paths.
|
||||
// If the sender finishes first, we will drop the reader future.
|
||||
// If the reader finishes first, the sender will finish too since
|
||||
// the wal sender has dropped.
|
||||
let res = tokio::try_join!(sender.run(), reader.run(start_pos, &appname));
|
||||
match res.map(|_| ()) {
|
||||
Ok(_) => unreachable!("sender finishes with Err by convention"),
|
||||
err_res => err_res,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -470,7 +679,8 @@ impl SafekeeperPostgresHandler {
|
||||
.clone();
|
||||
info!(
|
||||
"finished streaming to {}, feedback={:?}",
|
||||
ws_state.addr, ws_state.feedback,
|
||||
ws_state.get_addr(),
|
||||
ws_state.get_feedback(),
|
||||
);
|
||||
|
||||
// Join pg backend back.
|
||||
@@ -578,6 +788,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
|
||||
/// convenience.
|
||||
async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let metric = WAL_READERS
|
||||
.get_metric_with_label_values(&[
|
||||
"future",
|
||||
self.appname.as_deref().unwrap_or("safekeeper"),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
metric.inc();
|
||||
scopeguard::defer! {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
// Wait for the next portion if it is not there yet, or just
|
||||
// update our end of WAL available for sending value, we
|
||||
@@ -813,7 +1035,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use safekeeper_api::models::FullTransactionId;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -830,13 +1052,13 @@ mod tests {
|
||||
|
||||
// add to wss specified feedback setting other fields to dummy values
|
||||
fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) {
|
||||
let walsender_state = WalSenderState {
|
||||
let walsender_state = WalSenderState::Vanilla(VanillaWalSenderInternalState {
|
||||
ttid: mock_ttid(),
|
||||
addr: mock_addr(),
|
||||
conn_id: 1,
|
||||
appname: None,
|
||||
feedback,
|
||||
};
|
||||
});
|
||||
wss.slots.push(Some(walsender_state))
|
||||
}
|
||||
|
||||
|
||||
@@ -1,20 +1,25 @@
|
||||
//! Defines per timeline data stored persistently (SafeKeeperPersistentState)
|
||||
//! and its wrapper with in memory layer (SafekeeperState).
|
||||
|
||||
use std::{cmp::max, ops::Deref};
|
||||
use std::{cmp::max, ops::Deref, time::SystemTime};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term};
|
||||
use safekeeper_api::{
|
||||
membership::Configuration,
|
||||
models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse},
|
||||
ServerInfo, Term, INITIAL_TERM,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
control_file,
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION},
|
||||
safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION},
|
||||
timeline::TimelineError,
|
||||
wal_backup_partial::{self},
|
||||
};
|
||||
@@ -27,6 +32,8 @@ pub struct TimelinePersistentState {
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: TimelineId,
|
||||
/// Membership configuration.
|
||||
pub mconf: Configuration,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
@@ -58,22 +65,15 @@ pub struct TimelinePersistentState {
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||
/// fundamental; but state is saved here only for informational purposes and
|
||||
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
/// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
/// Eviction state of the timeline. If it's Offloaded, we should download
|
||||
/// WAL files from remote storage to serve the timeline.
|
||||
pub eviction_state: EvictionState,
|
||||
pub creation_ts: SystemTime,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
|
||||
|
||||
/// State of the local WAL files. Used to track current timeline state,
|
||||
/// that can be either WAL files are present on disk or last partial segment
|
||||
/// is offloaded to remote storage.
|
||||
@@ -87,12 +87,14 @@ pub enum EvictionState {
|
||||
}
|
||||
|
||||
impl TimelinePersistentState {
|
||||
/// commit_lsn is the same as start_lsn in the normal creaiton; see
|
||||
/// `TimelineCreateRequest` comments.`
|
||||
pub fn new(
|
||||
ttid: &TenantTimelineId,
|
||||
mconf: Configuration,
|
||||
server_info: ServerInfo,
|
||||
peers: Vec<NodeId>,
|
||||
start_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> anyhow::Result<TimelinePersistentState> {
|
||||
if server_info.wal_seg_size == 0 {
|
||||
bail!(TimelineError::UninitializedWalSegSize(*ttid));
|
||||
@@ -102,49 +104,59 @@ impl TimelinePersistentState {
|
||||
bail!(TimelineError::UninitialinzedPgVersion(*ttid));
|
||||
}
|
||||
|
||||
if commit_lsn < local_start_lsn {
|
||||
if commit_lsn < start_lsn {
|
||||
bail!(
|
||||
"commit_lsn {} is smaller than local_start_lsn {}",
|
||||
"commit_lsn {} is smaller than start_lsn {}",
|
||||
commit_lsn,
|
||||
local_start_lsn
|
||||
start_lsn
|
||||
);
|
||||
}
|
||||
|
||||
// If we are given with init LSN, initialize term history with it. It
|
||||
// ensures that walproposer always must be able to find a common point
|
||||
// in histories; if it can't something is corrupted. Not having LSN here
|
||||
// is so far left for legacy case where timeline is created by compute
|
||||
// and LSN during creation is not known yet.
|
||||
let term_history = if commit_lsn != Lsn::INVALID {
|
||||
TermHistory(vec![TermLsn {
|
||||
term: INITIAL_TERM,
|
||||
lsn: start_lsn,
|
||||
}])
|
||||
} else {
|
||||
TermHistory::empty()
|
||||
};
|
||||
|
||||
Ok(TimelinePersistentState {
|
||||
tenant_id: ttid.tenant_id,
|
||||
timeline_id: ttid.timeline_id,
|
||||
mconf,
|
||||
acceptor_state: AcceptorState {
|
||||
term: 0,
|
||||
term_history: TermHistory::empty(),
|
||||
term: INITIAL_TERM,
|
||||
term_history,
|
||||
},
|
||||
server: server_info,
|
||||
proposer_uuid: [0; 16],
|
||||
timeline_start_lsn: Lsn(0),
|
||||
local_start_lsn,
|
||||
timeline_start_lsn: start_lsn,
|
||||
local_start_lsn: start_lsn,
|
||||
commit_lsn,
|
||||
backup_lsn: local_start_lsn,
|
||||
peer_horizon_lsn: local_start_lsn,
|
||||
backup_lsn: start_lsn,
|
||||
peer_horizon_lsn: start_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(
|
||||
peers
|
||||
.iter()
|
||||
.map(|p| (*p, PersistedPeerInfo::new()))
|
||||
.collect(),
|
||||
),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: SystemTime::now(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn empty() -> Self {
|
||||
TimelinePersistentState::new(
|
||||
&TenantTimelineId::empty(),
|
||||
Configuration::empty(),
|
||||
ServerInfo {
|
||||
pg_version: 170000, /* Postgres server version (major * 10000) */
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
},
|
||||
vec![],
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
@@ -249,6 +261,31 @@ where
|
||||
current_term: after,
|
||||
})
|
||||
}
|
||||
|
||||
/// Switch into membership configuration `to` if it is higher than the
|
||||
/// current one.
|
||||
pub async fn membership_switch(
|
||||
&mut self,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
let before = self.mconf.clone();
|
||||
// Is switch allowed?
|
||||
if to.generation <= self.mconf.generation {
|
||||
info!(
|
||||
"ignoring request to switch membership conf to lower {}, current conf {}",
|
||||
to, self.mconf
|
||||
);
|
||||
} else {
|
||||
let mut state = self.start_change();
|
||||
state.mconf = to.clone();
|
||||
self.finish_change(&state).await?;
|
||||
info!("switched membership conf to {} from {}", to, before);
|
||||
}
|
||||
Ok(TimelineMembershipSwitchResponse {
|
||||
previous_conf: before,
|
||||
current_conf: self.mconf.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<CTRL> Deref for TimelineState<CTRL>
|
||||
|
||||
@@ -1,13 +1,19 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::rate_limit::RateLimiter;
|
||||
use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory};
|
||||
use crate::receive_wal::WalAcceptor;
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
|
||||
ProposerElected, SafeKeeper, TermHistory,
|
||||
};
|
||||
use crate::send_wal::EndWatch;
|
||||
use crate::state::{TimelinePersistentState, TimelineState};
|
||||
use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::remote_timeline_path;
|
||||
use crate::{control_file, wal_storage, SafeKeeperConf};
|
||||
use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf};
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
|
||||
use tokio::fs::create_dir_all;
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -107,4 +113,59 @@ impl Env {
|
||||
);
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
// This will be dead code when building a non-benchmark target with the
|
||||
// benchmarking feature enabled.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn write_wal(
|
||||
tli: Arc<Timeline>,
|
||||
start_lsn: Lsn,
|
||||
msg_size: usize,
|
||||
msg_count: usize,
|
||||
) -> anyhow::Result<EndWatch> {
|
||||
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
|
||||
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
|
||||
|
||||
let end_watch = EndWatch::Commit(tli.get_commit_lsn_watch_rx());
|
||||
|
||||
WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
|
||||
|
||||
let prefix = c"p";
|
||||
let prefixlen = prefix.to_bytes_with_nul().len();
|
||||
assert!(msg_size >= prefixlen);
|
||||
let message = vec![0; msg_size - prefixlen];
|
||||
|
||||
let walgen =
|
||||
&mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn);
|
||||
for _ in 0..msg_count {
|
||||
let (lsn, record) = walgen.next().unwrap();
|
||||
|
||||
let req = AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
term: 1,
|
||||
term_start_lsn: start_lsn,
|
||||
begin_lsn: lsn,
|
||||
end_lsn: lsn + record.len() as u64,
|
||||
commit_lsn: lsn,
|
||||
truncate_lsn: Lsn(0),
|
||||
proposer_uuid: [0; 16],
|
||||
},
|
||||
wal_data: record,
|
||||
};
|
||||
|
||||
let end_lsn = req.h.end_lsn;
|
||||
|
||||
let msg = ProposerAcceptorMessage::AppendRequest(req);
|
||||
msg_tx.send(msg).await?;
|
||||
while let Some(reply) = reply_rx.recv().await {
|
||||
if let AcceptorProposerMessage::AppendResponse(resp) = reply {
|
||||
if resp.flush_lsn >= end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(end_watch)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,10 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use remote_storage::RemotePath;
|
||||
use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse};
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::models::{
|
||||
PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse,
|
||||
};
|
||||
use safekeeper_api::Term;
|
||||
use tokio::fs::{self};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -32,7 +35,7 @@ use crate::control_file;
|
||||
use crate::rate_limit::RateLimiter;
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn};
|
||||
use crate::send_wal::WalSenders;
|
||||
use crate::send_wal::{WalSenders, WalSendersTimelineMetricValues};
|
||||
use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState};
|
||||
use crate::timeline_guard::ResidenceGuard;
|
||||
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
|
||||
@@ -188,6 +191,13 @@ impl StateSK {
|
||||
self.state_mut().term_bump(to).await
|
||||
}
|
||||
|
||||
pub async fn membership_switch(
|
||||
&mut self,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
self.state_mut().membership_switch(to).await
|
||||
}
|
||||
|
||||
/// Close open WAL files to release FDs.
|
||||
fn close_wal_store(&mut self) {
|
||||
if let StateSK::Loaded(sk) = self {
|
||||
@@ -702,16 +712,22 @@ impl Timeline {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
|
||||
let WalSendersTimelineMetricValues {
|
||||
ps_feedback_counter,
|
||||
last_ps_feedback,
|
||||
interpreted_wal_reader_tasks,
|
||||
} = self.walsenders.info_for_metrics();
|
||||
|
||||
let state = self.read_shared_state().await;
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
ps_feedback_count,
|
||||
ps_feedback_count: ps_feedback_counter,
|
||||
last_ps_feedback,
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
timeline_is_active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
|
||||
interpreted_wal_reader_tasks,
|
||||
epoch_start_lsn: state.sk.term_start_lsn(),
|
||||
mem_state: state.sk.state().inmem.clone(),
|
||||
persisted_state: TimelinePersistentState::clone(state.sk.state()),
|
||||
@@ -730,7 +746,7 @@ impl Timeline {
|
||||
debug_dump::Memory {
|
||||
is_cancelled: self.is_cancelled(),
|
||||
peers_info_len: state.peers_info.0.len(),
|
||||
walsenders: self.walsenders.get_all(),
|
||||
walsenders: self.walsenders.get_all_public(),
|
||||
wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
|
||||
active: self.broker_active.load(Ordering::Relaxed),
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
@@ -768,6 +784,14 @@ impl Timeline {
|
||||
state.sk.term_bump(to).await
|
||||
}
|
||||
|
||||
pub async fn membership_switch(
|
||||
self: &Arc<Self>,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
let mut state = self.write_shared_state().await;
|
||||
state.sk.membership_switch(to).await
|
||||
}
|
||||
|
||||
/// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`]
|
||||
async fn do_wal_residence_guard(
|
||||
self: &Arc<Self>,
|
||||
|
||||
@@ -12,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::ServerInfo;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
@@ -214,9 +215,10 @@ impl GlobalTimelines {
|
||||
pub(crate) async fn create(
|
||||
&self,
|
||||
ttid: TenantTimelineId,
|
||||
mconf: Configuration,
|
||||
server_info: ServerInfo,
|
||||
start_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, _, _) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
@@ -239,8 +241,7 @@ impl GlobalTimelines {
|
||||
|
||||
// TODO: currently we create only cfile. It would be reasonable to
|
||||
// immediately initialize first WAL segment as well.
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||
let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?;
|
||||
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
|
||||
let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
||||
Ok(timeline)
|
||||
|
||||
@@ -1,34 +1,16 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_stream::try_stream;
|
||||
use bytes::Bytes;
|
||||
use futures::Stream;
|
||||
use postgres_backend::CopyStreamHandlerEnd;
|
||||
use safekeeper_api::Term;
|
||||
use std::time::Duration;
|
||||
use tokio::time::timeout;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
send_wal::{EndWatch, WalSenderGuard},
|
||||
timeline::WalResidentTimeline,
|
||||
use std::{
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
};
|
||||
|
||||
pub(crate) struct WalReaderStreamBuilder {
|
||||
pub(crate) tli: WalResidentTimeline,
|
||||
pub(crate) start_pos: Lsn,
|
||||
pub(crate) end_pos: Lsn,
|
||||
pub(crate) term: Option<Term>,
|
||||
pub(crate) end_watch: EndWatch,
|
||||
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
|
||||
}
|
||||
use bytes::Bytes;
|
||||
use futures::{stream::BoxStream, Stream, StreamExt};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
impl WalReaderStreamBuilder {
|
||||
pub(crate) fn start_pos(&self) -> Lsn {
|
||||
self.start_pos
|
||||
}
|
||||
}
|
||||
use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader};
|
||||
use safekeeper_api::Term;
|
||||
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub(crate) struct WalBytes {
|
||||
/// Raw PG WAL
|
||||
pub(crate) wal: Bytes,
|
||||
@@ -44,106 +26,270 @@ pub(crate) struct WalBytes {
|
||||
pub(crate) available_wal_end_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl WalReaderStreamBuilder {
|
||||
/// Builds a stream of Postgres WAL starting from [`Self::start_pos`].
|
||||
/// The stream terminates when the receiver (pageserver) is fully caught up
|
||||
/// and there's no active computes.
|
||||
pub(crate) async fn build(
|
||||
self,
|
||||
buffer_size: usize,
|
||||
) -> anyhow::Result<impl Stream<Item = Result<WalBytes, CopyStreamHandlerEnd>>> {
|
||||
// TODO(vlad): The code below duplicates functionality from [`crate::send_wal`].
|
||||
// We can make the raw WAL sender use this stream too and remove the duplication.
|
||||
let Self {
|
||||
tli,
|
||||
mut start_pos,
|
||||
mut end_pos,
|
||||
term,
|
||||
mut end_watch,
|
||||
wal_sender_guard,
|
||||
} = self;
|
||||
let mut wal_reader = tli.get_walreader(start_pos).await?;
|
||||
let mut buffer = vec![0; buffer_size];
|
||||
struct PositionedWalReader {
|
||||
start: Lsn,
|
||||
end: Lsn,
|
||||
reader: Option<WalReader>,
|
||||
}
|
||||
|
||||
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
|
||||
/// A streaming WAL reader wrapper which can be reset while running
|
||||
pub(crate) struct StreamingWalReader {
|
||||
stream: BoxStream<'static, WalOrReset>,
|
||||
start_changed_tx: tokio::sync::watch::Sender<Lsn>,
|
||||
}
|
||||
|
||||
Ok(try_stream! {
|
||||
loop {
|
||||
let have_something_to_send = end_pos > start_pos;
|
||||
pub(crate) enum WalOrReset {
|
||||
Wal(anyhow::Result<WalBytes>),
|
||||
Reset(Lsn),
|
||||
}
|
||||
|
||||
if !have_something_to_send {
|
||||
// wait for lsn
|
||||
let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await;
|
||||
match res {
|
||||
Ok(ok) => {
|
||||
end_pos = ok?;
|
||||
},
|
||||
Err(_) => {
|
||||
if let EndWatch::Commit(_) = end_watch {
|
||||
if let Some(remote_consistent_lsn) = wal_sender_guard
|
||||
.walsenders()
|
||||
.get_ws_remote_consistent_lsn(wal_sender_guard.id())
|
||||
{
|
||||
if tli.should_walsender_stop(remote_consistent_lsn).await {
|
||||
// Stop streaming if the receivers are caught up and
|
||||
// there's no active compute. This causes the loop in
|
||||
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
|
||||
// to exit and terminate the WAL stream.
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
assert!(
|
||||
end_pos > start_pos,
|
||||
"nothing to send after waiting for WAL"
|
||||
);
|
||||
|
||||
// try to send as much as available, capped by the buffer size
|
||||
let mut chunk_end_pos = start_pos + buffer_size as u64;
|
||||
// if we went behind available WAL, back off
|
||||
if chunk_end_pos >= end_pos {
|
||||
chunk_end_pos = end_pos;
|
||||
} else {
|
||||
// If sending not up to end pos, round down to page boundary to
|
||||
// avoid breaking WAL record not at page boundary, as protocol
|
||||
// demands. See walsender.c (XLogSendPhysical).
|
||||
chunk_end_pos = chunk_end_pos
|
||||
.checked_sub(chunk_end_pos.block_offset())
|
||||
.unwrap();
|
||||
}
|
||||
let send_size = (chunk_end_pos.0 - start_pos.0) as usize;
|
||||
let buffer = &mut buffer[..send_size];
|
||||
let send_size: usize;
|
||||
{
|
||||
// If uncommitted part is being pulled, check that the term is
|
||||
// still the expected one.
|
||||
let _term_guard = if let Some(t) = term {
|
||||
Some(tli.acquire_term(t).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// Read WAL into buffer. send_size can be additionally capped to
|
||||
// segment boundary here.
|
||||
send_size = wal_reader.read(buffer).await?
|
||||
};
|
||||
let wal = Bytes::copy_from_slice(&buffer[..send_size]);
|
||||
|
||||
yield WalBytes {
|
||||
wal,
|
||||
wal_start_lsn: start_pos,
|
||||
wal_end_lsn: start_pos + send_size as u64,
|
||||
available_wal_end_lsn: end_pos
|
||||
};
|
||||
|
||||
start_pos += send_size as u64;
|
||||
}
|
||||
})
|
||||
impl WalOrReset {
|
||||
pub(crate) fn get_wal(self) -> Option<anyhow::Result<WalBytes>> {
|
||||
match self {
|
||||
WalOrReset::Wal(wal) => Some(wal),
|
||||
WalOrReset::Reset(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StreamingWalReader {
|
||||
pub(crate) fn new(
|
||||
tli: WalResidentTimeline,
|
||||
term: Option<Term>,
|
||||
start: Lsn,
|
||||
end: Lsn,
|
||||
end_watch: EndWatch,
|
||||
buffer_size: usize,
|
||||
) -> Self {
|
||||
let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start);
|
||||
|
||||
let state = WalReaderStreamState {
|
||||
tli,
|
||||
wal_reader: PositionedWalReader {
|
||||
start,
|
||||
end,
|
||||
reader: None,
|
||||
},
|
||||
term,
|
||||
end_watch,
|
||||
buffer: vec![0; buffer_size],
|
||||
buffer_size,
|
||||
};
|
||||
|
||||
// When a change notification is received while polling the internal
|
||||
// reader, stop polling the read future and service the change.
|
||||
let stream = futures::stream::unfold(
|
||||
(state, start_changed_rx),
|
||||
|(mut state, mut rx)| async move {
|
||||
let wal_or_reset = tokio::select! {
|
||||
read_res = state.read() => { WalOrReset::Wal(read_res) },
|
||||
changed_res = rx.changed() => {
|
||||
if changed_res.is_err() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let new_start_pos = rx.borrow_and_update();
|
||||
WalOrReset::Reset(*new_start_pos)
|
||||
}
|
||||
};
|
||||
|
||||
if let WalOrReset::Reset(lsn) = wal_or_reset {
|
||||
state.wal_reader.start = lsn;
|
||||
state.wal_reader.reader = None;
|
||||
}
|
||||
|
||||
Some((wal_or_reset, (state, rx)))
|
||||
},
|
||||
)
|
||||
.boxed();
|
||||
|
||||
Self {
|
||||
stream,
|
||||
start_changed_tx,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the stream to a given position.
|
||||
pub(crate) async fn reset(&mut self, start: Lsn) {
|
||||
self.start_changed_tx.send(start).unwrap();
|
||||
while let Some(wal_or_reset) = self.stream.next().await {
|
||||
match wal_or_reset {
|
||||
WalOrReset::Reset(at) => {
|
||||
// Stream confirmed the reset.
|
||||
// There may only one ongoing reset at any given time,
|
||||
// hence the assertion.
|
||||
assert_eq!(at, start);
|
||||
break;
|
||||
}
|
||||
WalOrReset::Wal(_) => {
|
||||
// Ignore wal generated before reset was handled
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for StreamingWalReader {
|
||||
type Item = WalOrReset;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
Pin::new(&mut self.stream).poll_next(cx)
|
||||
}
|
||||
}
|
||||
|
||||
struct WalReaderStreamState {
|
||||
tli: WalResidentTimeline,
|
||||
wal_reader: PositionedWalReader,
|
||||
term: Option<Term>,
|
||||
end_watch: EndWatch,
|
||||
buffer: Vec<u8>,
|
||||
buffer_size: usize,
|
||||
}
|
||||
|
||||
impl WalReaderStreamState {
|
||||
async fn read(&mut self) -> anyhow::Result<WalBytes> {
|
||||
// Create reader if needed
|
||||
if self.wal_reader.reader.is_none() {
|
||||
self.wal_reader.reader = Some(self.tli.get_walreader(self.wal_reader.start).await?);
|
||||
}
|
||||
|
||||
let have_something_to_send = self.wal_reader.end > self.wal_reader.start;
|
||||
if !have_something_to_send {
|
||||
tracing::debug!(
|
||||
"Waiting for wal: start={}, end={}",
|
||||
self.wal_reader.end,
|
||||
self.wal_reader.start
|
||||
);
|
||||
self.wal_reader.end = self
|
||||
.end_watch
|
||||
.wait_for_lsn(self.wal_reader.start, self.term)
|
||||
.await?;
|
||||
tracing::debug!(
|
||||
"Done waiting for wal: start={}, end={}",
|
||||
self.wal_reader.end,
|
||||
self.wal_reader.start
|
||||
);
|
||||
}
|
||||
|
||||
assert!(
|
||||
self.wal_reader.end > self.wal_reader.start,
|
||||
"nothing to send after waiting for WAL"
|
||||
);
|
||||
|
||||
// Calculate chunk size
|
||||
let mut chunk_end_pos = self.wal_reader.start + self.buffer_size as u64;
|
||||
if chunk_end_pos >= self.wal_reader.end {
|
||||
chunk_end_pos = self.wal_reader.end;
|
||||
} else {
|
||||
chunk_end_pos = chunk_end_pos
|
||||
.checked_sub(chunk_end_pos.block_offset())
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let send_size = (chunk_end_pos.0 - self.wal_reader.start.0) as usize;
|
||||
let buffer = &mut self.buffer[..send_size];
|
||||
|
||||
// Read WAL
|
||||
let send_size = {
|
||||
let _term_guard = if let Some(t) = self.term {
|
||||
Some(self.tli.acquire_term(t).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
self.wal_reader
|
||||
.reader
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.read(buffer)
|
||||
.await?
|
||||
};
|
||||
|
||||
let wal = Bytes::copy_from_slice(&buffer[..send_size]);
|
||||
let result = WalBytes {
|
||||
wal,
|
||||
wal_start_lsn: self.wal_reader.start,
|
||||
wal_end_lsn: self.wal_reader.start + send_size as u64,
|
||||
available_wal_end_lsn: self.wal_reader.end,
|
||||
};
|
||||
|
||||
self.wal_reader.start += send_size as u64;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use futures::StreamExt;
|
||||
use postgres_ffi::MAX_SEND_SIZE;
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_streaming_wal_reader_reset() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 200;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
tracing::info!("Doing first round of reads ...");
|
||||
|
||||
let mut streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let mut before_reset = Vec::new();
|
||||
while let Some(wor) = streaming_wal_reader.next().await {
|
||||
let wal = wor.get_wal().unwrap().unwrap();
|
||||
let stop = wal.available_wal_end_lsn == wal.wal_end_lsn;
|
||||
before_reset.push(wal);
|
||||
|
||||
if stop {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Resetting the WAL stream ...");
|
||||
|
||||
streaming_wal_reader.reset(start_lsn).await;
|
||||
|
||||
tracing::info!("Doing second round of reads ...");
|
||||
|
||||
let mut after_reset = Vec::new();
|
||||
while let Some(wor) = streaming_wal_reader.next().await {
|
||||
let wal = wor.get_wal().unwrap().unwrap();
|
||||
let stop = wal.available_wal_end_lsn == wal.wal_end_lsn;
|
||||
after_reset.push(wal);
|
||||
|
||||
if stop {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(before_reset, after_reset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@ use safekeeper::{
|
||||
wal_storage::Storage,
|
||||
SafeKeeperConf,
|
||||
};
|
||||
use safekeeper_api::ServerInfo;
|
||||
use safekeeper_api::{membership::Configuration, ServerInfo};
|
||||
use tracing::{debug, info_span, warn};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -96,8 +96,13 @@ impl GlobalMap {
|
||||
let commit_lsn = Lsn::INVALID;
|
||||
let local_start_lsn = Lsn::INVALID;
|
||||
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||
let state = TimelinePersistentState::new(
|
||||
&ttid,
|
||||
Configuration::empty(),
|
||||
server_info,
|
||||
commit_lsn,
|
||||
local_start_lsn,
|
||||
)?;
|
||||
|
||||
let disk_timeline = self.disk.put_state(&ttid, state);
|
||||
let control_store = DiskStateStorage::new(disk_timeline.clone());
|
||||
@@ -173,6 +178,8 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
|
||||
control_file_save_interval: Duration::from_secs(1),
|
||||
partial_backup_concurrency: 1,
|
||||
eviction_min_resident: Duration::ZERO,
|
||||
wal_reader_fanout: false,
|
||||
max_delta_for_fanout: None,
|
||||
};
|
||||
|
||||
let mut global = GlobalMap::new(disk, conf.clone())?;
|
||||
|
||||
@@ -15,7 +15,7 @@ use metrics::{BuildInfo, NeonMetrics};
|
||||
use pageserver_api::controller_api::{
|
||||
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
|
||||
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
|
||||
ShardsPreferredAzsRequest, TenantCreateRequest,
|
||||
SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
|
||||
@@ -653,6 +653,10 @@ async fn handle_tenant_list(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let limit: Option<usize> = parse_query_param(&req, "limit")?;
|
||||
let start_after: Option<TenantId> = parse_query_param(&req, "start_after")?;
|
||||
tracing::info!("start_after: {:?}", start_after);
|
||||
|
||||
match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
@@ -660,7 +664,7 @@ async fn handle_tenant_list(
|
||||
ForwardOutcome::NotForwarded(_req) => {}
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, service.tenant_list())
|
||||
json_response(StatusCode::OK, service.tenant_list(limit, start_after))
|
||||
}
|
||||
|
||||
async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -1301,6 +1305,35 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Sets the scheduling policy of the specified safekeeper
|
||||
async fn handle_safekeeper_scheduling_policy(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let body = json_request::<SafekeeperSchedulingPolicyRequest>(&mut req).await?;
|
||||
let id = parse_request_param::<i64>(&req, "id")?;
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
state
|
||||
.service
|
||||
.set_safekeeper_scheduling_policy(id, body.scheduling_policy)
|
||||
.await?;
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::NO_CONTENT)
|
||||
.body(Body::empty())
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(
|
||||
@@ -1869,7 +1902,18 @@ pub fn make_router(
|
||||
})
|
||||
.post("/control/v1/safekeeper/:id", |r| {
|
||||
// id is in the body
|
||||
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
|
||||
named_request_span(
|
||||
r,
|
||||
handle_upsert_safekeeper,
|
||||
RequestName("v1_safekeeper_post"),
|
||||
)
|
||||
})
|
||||
.post("/control/v1/safekeeper/:id/scheduling_policy", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_safekeeper_scheduling_policy,
|
||||
RequestName("v1_safekeeper_status"),
|
||||
)
|
||||
})
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
|
||||
@@ -1104,6 +1104,37 @@ impl Persistence {
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn set_safekeeper_scheduling_policy(
|
||||
&self,
|
||||
id_: i64,
|
||||
scheduling_policy_: SkSchedulingPolicy,
|
||||
) -> Result<(), DatabaseError> {
|
||||
use crate::schema::safekeepers::dsl::*;
|
||||
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
#[derive(Insertable, AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::safekeepers)]
|
||||
struct UpdateSkSchedulingPolicy<'a> {
|
||||
id: i64,
|
||||
scheduling_policy: &'a str,
|
||||
}
|
||||
let scheduling_policy_ = String::from(scheduling_policy_);
|
||||
|
||||
let rows_affected = diesel::update(safekeepers.filter(id.eq(id_)))
|
||||
.set(scheduling_policy.eq(scheduling_policy_))
|
||||
.execute(conn)?;
|
||||
|
||||
if rows_affected != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"unexpected number of rows ({rows_affected})",
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
||||
|
||||
@@ -47,7 +47,7 @@ use pageserver_api::{
|
||||
AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
|
||||
NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
|
||||
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
|
||||
ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
|
||||
ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
|
||||
TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
|
||||
TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse,
|
||||
@@ -4158,17 +4158,42 @@ impl Service {
|
||||
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
|
||||
/// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not
|
||||
/// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory
|
||||
/// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses
|
||||
/// in our external API.
|
||||
pub(crate) fn tenant_list(
|
||||
&self,
|
||||
limit: Option<usize>,
|
||||
start_after: Option<TenantId>,
|
||||
) -> Vec<TenantDescribeResponse> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
// Apply start_from parameter
|
||||
let shard_range = match start_after {
|
||||
None => locked.tenants.range(..),
|
||||
Some(tenant_id) => locked.tenants.range(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(u8::MAX),
|
||||
shard_count: ShardCount(u8::MAX),
|
||||
}..,
|
||||
),
|
||||
};
|
||||
|
||||
let mut result = Vec::new();
|
||||
for (_tenant_id, tenant_shards) in
|
||||
&locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
|
||||
{
|
||||
for (_tenant_id, tenant_shards) in &shard_range.group_by(|(id, _shard)| id.tenant_id) {
|
||||
result.push(
|
||||
self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
|
||||
.expect("Groups are always non-empty"),
|
||||
);
|
||||
|
||||
// Enforce `limit` parameter
|
||||
if let Some(limit) = limit {
|
||||
if result.len() >= limit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
@@ -7626,6 +7651,16 @@ impl Service {
|
||||
self.persistence.safekeeper_upsert(record).await
|
||||
}
|
||||
|
||||
pub(crate) async fn set_safekeeper_scheduling_policy(
|
||||
&self,
|
||||
id: i64,
|
||||
scheduling_policy: SkSchedulingPolicy,
|
||||
) -> Result<(), DatabaseError> {
|
||||
self.persistence
|
||||
.set_safekeeper_scheduling_policy(id, scheduling_policy)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn update_shards_preferred_azs(
|
||||
&self,
|
||||
req: ShardsPreferredAzsRequest,
|
||||
|
||||
@@ -1884,7 +1884,10 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def tenant_list(self):
|
||||
def tenant_shard_dump(self):
|
||||
"""
|
||||
Debug listing API: dumps the internal map of tenant shards
|
||||
"""
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/debug/v1/tenant",
|
||||
@@ -1892,6 +1895,18 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def tenant_list(self, **kwargs):
|
||||
"""
|
||||
Control API tenant listing: a vector of the same content returned by tenant_describe
|
||||
"""
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/control/v1/tenant",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
params=kwargs,
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def node_configure(self, node_id, body: dict[str, Any]):
|
||||
log.info(f"node_configure({node_id}, {body})")
|
||||
body["node_id"] = node_id
|
||||
@@ -2238,7 +2253,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
"""
|
||||
Get the intent and observed placements of all tenants known to the storage controller.
|
||||
"""
|
||||
tenants = self.tenant_list()
|
||||
tenants = self.tenant_shard_dump()
|
||||
|
||||
tenant_placement: defaultdict[str, dict[str, Any]] = defaultdict(
|
||||
lambda: {
|
||||
@@ -2321,6 +2336,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
json=body,
|
||||
)
|
||||
|
||||
def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str):
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
json={"id": id, "scheduling_policy": scheduling_policy},
|
||||
)
|
||||
|
||||
def get_safekeeper(self, id: int) -> dict[str, Any] | None:
|
||||
try:
|
||||
response = self.request(
|
||||
@@ -4120,7 +4143,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
|
||||
# Checkpoints running endpoint and returns pg_wal size in MB.
|
||||
def get_pg_wal_size(self):
|
||||
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
|
||||
log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}")
|
||||
self.safe_psql("checkpoint")
|
||||
assert self.pgdata_dir is not None # please mypy
|
||||
return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
|
||||
@@ -4960,7 +4983,7 @@ def logical_replication_sync(
|
||||
if res:
|
||||
log.info(f"subscriber_lsn={res}")
|
||||
subscriber_lsn = Lsn(res)
|
||||
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
|
||||
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}")
|
||||
if subscriber_lsn >= publisher_lsn:
|
||||
return subscriber_lsn
|
||||
time.sleep(0.5)
|
||||
|
||||
@@ -15,7 +15,6 @@ from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from fixtures.common_types import (
|
||||
Id,
|
||||
Lsn,
|
||||
TenantId,
|
||||
TenantShardId,
|
||||
@@ -25,7 +24,7 @@ from fixtures.common_types import (
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import Fn
|
||||
from fixtures.utils import EnhancedJSONEncoder, Fn
|
||||
|
||||
|
||||
class PageserverApiException(Exception):
|
||||
@@ -83,14 +82,6 @@ class TimelineCreateRequest:
|
||||
mode: TimelineCreateRequestMode
|
||||
|
||||
def to_json(self) -> str:
|
||||
class EnhancedJSONEncoder(json.JSONEncoder):
|
||||
def default(self, o):
|
||||
if dataclasses.is_dataclass(o) and not isinstance(o, type):
|
||||
return dataclasses.asdict(o)
|
||||
elif isinstance(o, Id):
|
||||
return o.id.hex()
|
||||
return super().default(o)
|
||||
|
||||
# mode is flattened
|
||||
this = dataclasses.asdict(self)
|
||||
mode = this.pop("mode")
|
||||
|
||||
@@ -10,7 +10,7 @@ import requests
|
||||
from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.utils import EnhancedJSONEncoder, wait_until
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any
|
||||
@@ -25,6 +25,7 @@ class Walreceiver:
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
mconf: Configuration | None
|
||||
term: int
|
||||
last_log_term: int
|
||||
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
@@ -69,6 +70,56 @@ class TermBumpResponse:
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperId:
|
||||
id: int
|
||||
host: str
|
||||
pg_port: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class Configuration:
|
||||
generation: int
|
||||
members: list[SafekeeperId]
|
||||
new_members: list[SafekeeperId] | None
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: dict[str, Any]) -> Configuration:
|
||||
generation = d["generation"]
|
||||
members = d["members"]
|
||||
new_members = d.get("new_members")
|
||||
return Configuration(generation, members, new_members)
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(self, cls=EnhancedJSONEncoder)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TimelineCreateRequest:
|
||||
tenant_id: TenantId
|
||||
timeline_id: TimelineId
|
||||
mconf: Configuration
|
||||
# not exactly PgVersion, for example 150002 for 15.2
|
||||
pg_version: int
|
||||
start_lsn: Lsn
|
||||
commit_lsn: Lsn | None
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(self, cls=EnhancedJSONEncoder)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TimelineMembershipSwitchResponse:
|
||||
previous_conf: Configuration
|
||||
current_conf: Configuration
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
|
||||
previous_conf = Configuration.from_json(d["previous_conf"])
|
||||
current_conf = Configuration.from_json(d["current_conf"])
|
||||
return TimelineMembershipSwitchResponse(previous_conf, current_conf)
|
||||
|
||||
|
||||
class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
HTTPError = requests.HTTPError
|
||||
|
||||
@@ -131,20 +182,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
resj = res.json()
|
||||
return [TenantTimelineId.from_json(ttidj) for ttidj in resj]
|
||||
|
||||
def timeline_create(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
commit_lsn: Lsn,
|
||||
):
|
||||
body = {
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"pg_version": pg_version,
|
||||
"commit_lsn": str(commit_lsn),
|
||||
}
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
|
||||
def timeline_create(self, r: TimelineCreateRequest):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", data=r.to_json())
|
||||
res.raise_for_status()
|
||||
|
||||
def timeline_status(
|
||||
@@ -154,7 +193,10 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
res.raise_for_status()
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
# It is always normally not None, it is allowed only to make forward compat tests happy.
|
||||
mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
|
||||
return SafekeeperTimelineStatus(
|
||||
mconf=mconf,
|
||||
term=resj["acceptor_state"]["term"],
|
||||
last_log_term=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
@@ -180,6 +222,11 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
|
||||
return self.timeline_status(tenant_id, timeline_id).commit_lsn
|
||||
|
||||
# Get timeline membership configuration.
|
||||
def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
|
||||
# make mypy happy
|
||||
return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore
|
||||
|
||||
# only_local doesn't remove segments in the remote storage.
|
||||
def timeline_delete(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
|
||||
@@ -226,6 +273,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def membership_switch(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
|
||||
) -> TimelineMembershipSwitchResponse:
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",
|
||||
data=to.to_json(),
|
||||
)
|
||||
res.raise_for_status()
|
||||
return TimelineMembershipSwitchResponse.from_json(res.json())
|
||||
|
||||
def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: dict[str, Any]):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -21,6 +22,7 @@ import zstandard
|
||||
from psycopg2.extensions import cursor
|
||||
from typing_extensions import override
|
||||
|
||||
from fixtures.common_types import Id, Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.common_types import (
|
||||
parse_delta_layer,
|
||||
@@ -605,6 +607,22 @@ class PropagatingThread(threading.Thread):
|
||||
return self.ret
|
||||
|
||||
|
||||
class EnhancedJSONEncoder(json.JSONEncoder):
|
||||
"""
|
||||
Default json.JSONEncoder works only on primitive builtins. Extend it to any
|
||||
dataclass plus our custom types.
|
||||
"""
|
||||
|
||||
def default(self, o):
|
||||
if dataclasses.is_dataclass(o) and not isinstance(o, type):
|
||||
return dataclasses.asdict(o)
|
||||
elif isinstance(o, Id):
|
||||
return o.id.hex()
|
||||
elif isinstance(o, Lsn):
|
||||
return str(o) # standard hex notation
|
||||
return super().default(o)
|
||||
|
||||
|
||||
def human_bytes(amt: float) -> str:
|
||||
"""
|
||||
Render a bytes amount into nice IEC bytes string.
|
||||
|
||||
@@ -53,6 +53,22 @@ class Workload:
|
||||
self._endpoint: Endpoint | None = None
|
||||
self._endpoint_opts = endpoint_opts or {}
|
||||
|
||||
def branch(
|
||||
self,
|
||||
timeline_id: TimelineId,
|
||||
branch_name: str | None = None,
|
||||
endpoint_opts: dict[str, Any] | None = None,
|
||||
) -> Workload:
|
||||
"""
|
||||
Checkpoint the current status of the workload in case of branching
|
||||
"""
|
||||
branch_workload = Workload(
|
||||
self.env, self.tenant_id, timeline_id, branch_name, endpoint_opts
|
||||
)
|
||||
branch_workload.expect_rows = self.expect_rows
|
||||
branch_workload.churn_cursor = self.churn_cursor
|
||||
return branch_workload
|
||||
|
||||
def reconfigure(self) -> None:
|
||||
"""
|
||||
Request the endpoint to reconfigure based on location reported by storage controller
|
||||
|
||||
@@ -112,7 +112,11 @@ page_cache_size=10
|
||||
|
||||
|
||||
@skip_in_debug_build("only run with release build")
|
||||
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize(
|
||||
"with_branches",
|
||||
["with_branches", "no_branches"],
|
||||
)
|
||||
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_branches: str):
|
||||
SMOKE_CONF = {
|
||||
# Run both gc and gc-compaction.
|
||||
"gc_period": "5s",
|
||||
@@ -143,12 +147,17 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
log.info("Writing initial data ...")
|
||||
workload.write_rows(row_count, env.pageserver.id)
|
||||
|
||||
child_workloads: list[Workload] = []
|
||||
|
||||
for i in range(1, churn_rounds + 1):
|
||||
if i % 10 == 0:
|
||||
log.info(f"Running churn round {i}/{churn_rounds} ...")
|
||||
|
||||
if (i - 1) % 10 == 0:
|
||||
# Run gc-compaction every 10 rounds to ensure the test doesn't take too long time.
|
||||
if i % 10 == 5 and with_branches == "with_branches":
|
||||
branch_name = f"child-{i}"
|
||||
branch_timeline_id = env.create_branch(branch_name)
|
||||
child_workloads.append(workload.branch(branch_timeline_id, branch_name))
|
||||
if (i - 1) % 10 == 0 or (i - 1) % 10 == 1:
|
||||
# Run gc-compaction twice every 10 rounds to ensure the test doesn't take too long time.
|
||||
ps_http.timeline_compact(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
@@ -179,6 +188,9 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
log.info("Validating at workload end ...")
|
||||
workload.validate(env.pageserver.id)
|
||||
for child_workload in child_workloads:
|
||||
log.info(f"Validating at branch {child_workload.branch_name}")
|
||||
child_workload.validate(env.pageserver.id)
|
||||
|
||||
# Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
|
||||
|
||||
@@ -1,60 +0,0 @@
|
||||
# NB: there are benchmarks that double-serve as tests inside the `performance` directory.
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct
|
||||
@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"])
|
||||
def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str):
|
||||
def patch_pageserver_toml(config):
|
||||
config["page_service_pipelining"] = {
|
||||
"mode": "pipelined",
|
||||
"max_batch_size": 32,
|
||||
"execution": "concurrent-futures",
|
||||
}
|
||||
|
||||
neon_env_builder.pageserver_config_override = patch_pageserver_toml
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
log.info("make flush appear slow")
|
||||
|
||||
log.info("sending requests until pageserver accepts no more")
|
||||
# TODO: extract this into a helper, like subprocess_capture,
|
||||
# so that we capture the stderr from the helper somewhere.
|
||||
child = subprocess.Popen(
|
||||
[
|
||||
neon_binpath / "test_helper_slow_client_reads",
|
||||
env.pageserver.connstr(),
|
||||
str(env.initial_tenant),
|
||||
str(env.initial_timeline),
|
||||
],
|
||||
bufsize=0, # unbuffered
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
)
|
||||
assert child.stdout is not None
|
||||
buf = child.stdout.read(1)
|
||||
if len(buf) != 1:
|
||||
raise Exception("unexpected EOF")
|
||||
if buf != b"R":
|
||||
raise Exception(f"unexpected data: {buf!r}")
|
||||
log.info("helper reports pageserver accepts no more requests")
|
||||
log.info(
|
||||
"assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace"
|
||||
)
|
||||
|
||||
if kind == "pageserver-stop":
|
||||
log.info("try to shut down the pageserver cleanly")
|
||||
env.pageserver.stop()
|
||||
elif kind == "tenant-detach":
|
||||
log.info("try to shut down the tenant")
|
||||
env.pageserver.tenant_detach(env.initial_tenant)
|
||||
else:
|
||||
raise ValueError(f"unexpected kind: {kind}")
|
||||
|
||||
log.info("shutdown did not time out, test passed")
|
||||
@@ -113,6 +113,19 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination)
|
||||
for tid in tenant_ids:
|
||||
env.create_tenant(tid, shard_count=shards_per_tenant)
|
||||
|
||||
# Tenant listing API should work
|
||||
listed_tenants = env.storage_controller.tenant_list()
|
||||
log.info(f"listed_tenants: {listed_tenants}")
|
||||
assert set(t["tenant_id"] for t in listed_tenants) == set(str(t) for t in tenant_ids)
|
||||
paged = env.storage_controller.tenant_list(limit=2, start_after=listed_tenants[0]["tenant_id"])
|
||||
assert len(paged) == 2
|
||||
assert paged[0] == listed_tenants[1]
|
||||
assert paged[1] == listed_tenants[2]
|
||||
paged = env.storage_controller.tenant_list(
|
||||
limit=1000, start_after="ffffffffffffffffffffffffffffffff"
|
||||
)
|
||||
assert paged == []
|
||||
|
||||
# Validate high level metrics
|
||||
assert (
|
||||
env.storage_controller.get_metric_value("storage_controller_tenant_shards")
|
||||
@@ -1506,7 +1519,7 @@ class PageserverFailpoint(Failure):
|
||||
|
||||
|
||||
def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]:
|
||||
tenants = env.storage_controller.tenant_list()
|
||||
tenants = env.storage_controller.tenant_shard_dump()
|
||||
|
||||
node_to_tenants: dict[int, list[TenantId]] = {}
|
||||
for t in tenants:
|
||||
@@ -2631,7 +2644,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
|
||||
# Validate that the storcon attempts to forward the request, but stops.
|
||||
# when it realises it is still the current leader.
|
||||
with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"):
|
||||
env.storage_controller.tenant_list()
|
||||
env.storage_controller.tenant_shard_dump()
|
||||
|
||||
# Validate that we can step down multiple times and the observed state
|
||||
# doesn't change.
|
||||
@@ -2781,7 +2794,7 @@ def test_storage_controller_leadership_transfer(
|
||||
# Check that the stepped down instance forwards requests
|
||||
# to the new leader while it's still running.
|
||||
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
|
||||
env.storage_controller.tenant_list()
|
||||
env.storage_controller.tenant_shard_dump()
|
||||
env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
|
||||
status = env.storage_controller.node_status(env.pageservers[0].id)
|
||||
assert status["scheduling"] == "Pause"
|
||||
@@ -3195,6 +3208,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
assert eq_safekeeper_records(body, inserted_now)
|
||||
|
||||
# some small tests for the scheduling policy querying and returning APIs
|
||||
newest_info = target.get_safekeeper(inserted["id"])
|
||||
assert newest_info
|
||||
assert newest_info["scheduling_policy"] == "Disabled"
|
||||
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
|
||||
newest_info = target.get_safekeeper(inserted["id"])
|
||||
assert newest_info
|
||||
assert newest_info["scheduling_policy"] == "Decomissioned"
|
||||
# Ensure idempotency
|
||||
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
|
||||
|
||||
|
||||
def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
|
||||
compared = [dict(a), dict(b)]
|
||||
|
||||
@@ -48,7 +48,12 @@ from fixtures.remote_storage import (
|
||||
default_remote_storage,
|
||||
s3_storage,
|
||||
)
|
||||
from fixtures.safekeeper.http import SafekeeperHttpClient
|
||||
from fixtures.safekeeper.http import (
|
||||
Configuration,
|
||||
SafekeeperHttpClient,
|
||||
SafekeeperId,
|
||||
TimelineCreateRequest,
|
||||
)
|
||||
from fixtures.safekeeper.utils import wait_walreceivers_absent
|
||||
from fixtures.utils import (
|
||||
PropagatingThread,
|
||||
@@ -658,7 +663,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
|
||||
for sk in env.safekeepers:
|
||||
sk.start()
|
||||
cli = sk.http_client()
|
||||
cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn)
|
||||
mconf = Configuration(generation=0, members=[], new_members=None)
|
||||
# set start_lsn to the beginning of the first segment to allow reading
|
||||
# WAL from there (could you intidb LSN as well).
|
||||
r = TimelineCreateRequest(
|
||||
tenant_id, timeline_id, mconf, pg_version, Lsn("0/1000000"), commit_lsn=last_lsn
|
||||
)
|
||||
cli.timeline_create(r)
|
||||
f_partial_path = (
|
||||
Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name
|
||||
)
|
||||
@@ -2237,6 +2248,63 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
|
||||
wait_until(unevicted_on_dest, interval=0.1, timeout=1.0)
|
||||
|
||||
|
||||
# Basic test for http API membership related calls: create timeline and switch
|
||||
# configuration. Normally these are called by storage controller, but this
|
||||
# allows to test them separately.
|
||||
@run_only_on_default_postgres("tests only safekeeper API")
|
||||
def test_membership_api(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 1
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
sk = env.safekeepers[0]
|
||||
http_cli = sk.http_client()
|
||||
|
||||
sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only)
|
||||
sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock
|
||||
|
||||
# Request to switch before timeline creation should fail.
|
||||
init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None)
|
||||
with pytest.raises(requests.exceptions.HTTPError):
|
||||
http_cli.membership_switch(tenant_id, timeline_id, init_conf)
|
||||
|
||||
# Create timeline.
|
||||
create_r = TimelineCreateRequest(
|
||||
tenant_id, timeline_id, init_conf, 150002, Lsn("0/1000000"), commit_lsn=None
|
||||
)
|
||||
log.info(f"sending {create_r.to_json()}")
|
||||
http_cli.timeline_create(create_r)
|
||||
|
||||
# Switch into some conf.
|
||||
joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
|
||||
resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf)
|
||||
log.info(f"joint switch resp: {resp}")
|
||||
assert resp.previous_conf.generation == 1
|
||||
assert resp.current_conf.generation == 4
|
||||
|
||||
# Restart sk, conf should be preserved.
|
||||
sk.stop().start()
|
||||
after_restart = http_cli.get_membership(tenant_id, timeline_id)
|
||||
log.info(f"conf after restart: {after_restart}")
|
||||
assert after_restart.generation == 4
|
||||
|
||||
# Switch into disjoint conf.
|
||||
non_joint = Configuration(generation=5, members=[sk_id_2], new_members=None)
|
||||
resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint)
|
||||
log.info(f"non joint switch resp: {resp}")
|
||||
assert resp.previous_conf.generation == 4
|
||||
assert resp.current_conf.generation == 5
|
||||
|
||||
# Switch request to lower conf should be ignored.
|
||||
lower_conf = Configuration(generation=3, members=[], new_members=None)
|
||||
resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf)
|
||||
log.info(f"lower switch resp: {resp}")
|
||||
assert resp.previous_conf.generation == 5
|
||||
assert resp.current_conf.generation == 5
|
||||
|
||||
|
||||
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
|
||||
# when compute is active, but there are no writes to the timeline. In that case
|
||||
# pageserver should maintain a single connection to safekeeper and don't attempt
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 210a0ba3af...46082f2088
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: d3141e17a7...dd0b28d6fb
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: f63b141cfb...d674efd776
2
vendor/postgres-v17
vendored
2
vendor/postgres-v17
vendored
Submodule vendor/postgres-v17 updated: 0f8da73ed0...a8dd6e779d
8
vendor/revisions.json
vendored
8
vendor/revisions.json
vendored
@@ -1,18 +1,18 @@
|
||||
{
|
||||
"v17": [
|
||||
"17.2",
|
||||
"0f8da73ed08d4fc4ee58cccea008c75bfb20baa8"
|
||||
"a8dd6e779dde907778006adb436b557ad652fb97"
|
||||
],
|
||||
"v16": [
|
||||
"16.6",
|
||||
"f63b141cfb0c813725a6b2574049565bff643018"
|
||||
"d674efd776f59d78e8fa1535bd2f95c3e6984fca"
|
||||
],
|
||||
"v15": [
|
||||
"15.10",
|
||||
"d3141e17a7155e3d07c8deba4a10c748a29ba1e6"
|
||||
"dd0b28d6fbad39e227f3b77296fcca879af8b3a9"
|
||||
],
|
||||
"v14": [
|
||||
"14.15",
|
||||
"210a0ba3afd8134ea910b203f274b165bd4f05d7"
|
||||
"46082f20884f087a2d974b33ac65d63af26142bd"
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user