Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-11 22:50:37 +00:00)

Compare commits: conrad/pro...release-62 (511 commits)
.github/actions/neon-project-create/action.yml | 12 (vendored)

@@ -14,11 +14,8 @@ inputs:
  api_host:
  description: 'Neon API host'
  default: console-stage.neon.build
- provisioner:
- description: 'k8s-pod or k8s-neonvm'
- default: 'k8s-pod'
  compute_units:
- description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+ description: '[Min, Max] compute units'
  default: '[1, 1]'

  outputs:
@@ -37,10 +34,6 @@ runs:
  # A shell without `set -x` to not to expose password/dsn in logs
  shell: bash -euo pipefail {0}
  run: |
- if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
- echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
- fi
-
  project=$(curl \
  "https://${API_HOST}/api/v2/projects" \
  --fail \
@@ -52,7 +45,7 @@ runs:
  \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
  \"pg_version\": ${POSTGRES_VERSION},
  \"region_id\": \"${REGION_ID}\",
- \"provisioner\": \"${PROVISIONER}\",
+ \"provisioner\": \"k8s-neonvm\",
  \"autoscaling_limit_min_cu\": ${MIN_CU},
  \"autoscaling_limit_max_cu\": ${MAX_CU},
  \"settings\": { }
@@ -75,6 +68,5 @@ runs:
  API_KEY: ${{ inputs.api_key }}
  REGION_ID: ${{ inputs.region_id }}
  POSTGRES_VERSION: ${{ inputs.postgres_version }}
- PROVISIONER: ${{ inputs.provisioner }}
  MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
  MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
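After this change the action always provisions `k8s-neonvm` computes, so the `provisioner` input and the `k8s-pod` MIN_CU/MAX_CU validation disappear. For reference, a minimal Rust sketch of the API call that the action's `curl` step performs is below; it assumes the `reqwest` (with its `json` feature), `serde_json`, and `anyhow` crates and includes only the request fields visible in the hunk above — the real payload, wrapping structure, and response handling are abbreviated, and the helper name is hypothetical.

```rust
use serde_json::json;

// Hypothetical helper mirroring the action's curl call; api_host/api_key map to
// the action inputs of the same name. Only fields shown in the diff are included.
async fn create_neon_project(api_host: &str, api_key: &str) -> anyhow::Result<serde_json::Value> {
    let body = json!({
        "name": "Created by actions/neon-project-create",
        "pg_version": 16,                // POSTGRES_VERSION input
        "region_id": "aws-us-east-2",    // REGION_ID input
        "provisioner": "k8s-neonvm",     // hard-coded after this change
        "autoscaling_limit_min_cu": 1,   // MIN_CU, from compute_units[0]
        "autoscaling_limit_max_cu": 1,   // MAX_CU, from compute_units[1]
        "settings": {}
    });
    let resp = reqwest::Client::new()
        .post(format!("https://{api_host}/api/v2/projects"))
        .bearer_auth(api_key)
        .json(&body)
        .send()
        .await?
        .error_for_status()?;
    Ok(resp.json().await?)
}
```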
@@ -19,6 +19,10 @@ on:
  description: 'debug or release'
  required: true
  type: string
+ pg-versions:
+ description: 'a json array of postgres versions to run regression tests on'
+ required: true
+ type: string

  defaults:
  run:
@@ -254,7 +258,7 @@ jobs:
  strategy:
  fail-fast: false
  matrix:
- pg_version: [ v14, v15, v16 ]
+ pg_version: ${{ fromJson(inputs.pg-versions) }}
  steps:
  - uses: actions/checkout@v4
  with:
@@ -284,5 +288,5 @@ jobs:
  - name: Merge and upload coverage data
  if: |
  false &&
- inputs.build-type == 'debug' && matrix.pg_version == 'v14'
+ inputs.build-type == 'debug' && matrix.pg_version == 'v16'
  uses: ./.github/actions/save-coverage-data
.github/workflows/benchmarking.yml | 52 (vendored)

@@ -63,11 +63,9 @@ jobs:
  - DEFAULT_PG_VERSION: 16
  PLATFORM: "neon-staging"
  region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
- provisioner: 'k8s-pod'
  - DEFAULT_PG_VERSION: 16
  PLATFORM: "azure-staging"
  region_id: 'azure-eastus2'
- provisioner: 'k8s-neonvm'
  env:
  TEST_PG_BENCH_DURATIONS_MATRIX: "300"
  TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -100,7 +98,6 @@ jobs:
  region_id: ${{ matrix.region_id }}
  postgres_version: ${{ env.DEFAULT_PG_VERSION }}
  api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- provisioner: ${{ matrix.provisioner }}

  - name: Run benchmark
  uses: ./.github/actions/run-python-test-set
@@ -216,11 +213,11 @@ jobs:
  # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
  #
  # Available platforms:
- # - neon-captest-new: Freshly created project (1 CU)
- # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
+ # - neonvm-captest-new: Freshly created project (1 CU)
+ # - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
  # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
  # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
- # - neon-captest-reuse: Reusing existing project
+ # - neonvm-captest-reuse: Reusing existing project
  # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
  # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
  env:
@@ -245,18 +242,16 @@ jobs:
  "'"$region_id_default"'"
  ],
  "platform": [
- "neon-captest-new",
- "neon-captest-reuse",
+ "neonvm-captest-new",
+ "neonvm-captest-reuse",
  "neonvm-captest-new"
  ],
  "db_size": [ "10gb" ],
- "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
- { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
- { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
+ "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
  { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
  { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
  { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
  { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
  { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
  }'

@@ -271,7 +266,7 @@ jobs:
  run: |
  matrix='{
  "platform": [
- "neon-captest-reuse"
+ "neonvm-captest-reuse"
  ]
  }'

@@ -287,7 +282,7 @@ jobs:
  run: |
  matrix='{
  "platform": [
- "neon-captest-reuse"
+ "neonvm-captest-reuse"
  ],
  "scale": [
  "10"
@@ -338,7 +333,7 @@ jobs:
  prefix: latest

  - name: Create Neon Project
- if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
+ if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
  id: create-neon-project
  uses: ./.github/actions/neon-project-create
  with:
@@ -346,19 +341,18 @@ jobs:
  postgres_version: ${{ env.DEFAULT_PG_VERSION }}
  api_key: ${{ secrets.NEON_STAGING_API_KEY }}
  compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
- provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

  - name: Set up Connection String
  id: set-up-connstr
  run: |
  case "${PLATFORM}" in
- neon-captest-reuse)
+ neonvm-captest-reuse)
  CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
  ;;
  neonvm-captest-sharding-reuse)
  CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
  ;;
- neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
+ neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
  CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
  ;;
  rds-aurora)
@@ -442,9 +436,9 @@ jobs:
  fail-fast: false
  matrix:
  include:
- - PLATFORM: "neon-captest-pgvector"
+ - PLATFORM: "neonvm-captest-pgvector"
  - PLATFORM: "azure-captest-pgvector"

  env:
  TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
  TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -486,7 +480,7 @@ jobs:
  id: set-up-connstr
  run: |
  case "${PLATFORM}" in
- neon-captest-pgvector)
+ neonvm-captest-pgvector)
  CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
  ;;
  azure-captest-pgvector)
@@ -585,7 +579,7 @@ jobs:
  id: set-up-connstr
  run: |
  case "${PLATFORM}" in
- neon-captest-reuse)
+ neonvm-captest-reuse)
  CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
  ;;
  rds-aurora)
@@ -595,7 +589,7 @@ jobs:
  CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
  ;;
  *)
- echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+ echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
  exit 1
  ;;
  esac
@@ -672,7 +666,7 @@ jobs:
  - name: Get Connstring Secret Name
  run: |
  case "${PLATFORM}" in
- neon-captest-reuse)
+ neonvm-captest-reuse)
  ENV_PLATFORM=CAPTEST_TPCH
  ;;
  rds-aurora)
@@ -682,7 +676,7 @@ jobs:
  ENV_PLATFORM=RDS_AURORA_TPCH
  ;;
  *)
- echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+ echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
  exit 1
  ;;
  esac
@@ -759,7 +753,7 @@ jobs:
  id: set-up-connstr
  run: |
  case "${PLATFORM}" in
- neon-captest-reuse)
+ neonvm-captest-reuse)
  CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
  ;;
  rds-aurora)
@@ -769,7 +763,7 @@ jobs:
  CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
  ;;
  *)
- echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+ echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
  exit 1
  ;;
  esac
.github/workflows/build_and_test.yml | 32 (vendored)

@@ -203,7 +203,8 @@ jobs:
  fail-fast: false
  matrix:
  arch: [ x64 ]
- build-type: [ debug, release ]
+ # Do not build or run tests in debug for release branches
+ build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
  include:
  - build-type: release
  arch: arm64
@@ -213,6 +214,8 @@ jobs:
  build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
  build-tag: ${{ needs.tag.outputs.build-tag }}
  build-type: ${{ matrix.build-type }}
+ # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
+ pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
  secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -306,7 +309,7 @@ jobs:
  SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
- needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
+ needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
  if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
  outputs:
  report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -833,6 +836,9 @@ jobs:
  rm -rf .docker-custom

  promote-images:
+ permissions:
+ contents: read # This is required for actions/checkout
+ id-token: write # This is required for Azure Login to work.
  needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
  runs-on: ubuntu-22.04

@@ -859,6 +865,28 @@ jobs:
  neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
  done

+ - name: Azure login
+ if: github.ref_name == 'main'
+ uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
+ with:
+ client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
+ tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+ subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+
+ - name: Login to ACR
+ if: github.ref_name == 'main'
+ run: |
+ az acr login --name=neoneastus2
+
+ - name: Copy docker images to ACR-dev
+ if: github.ref_name == 'main'
+ run: |
+ for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+ docker buildx imagetools create \
+ -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
+ neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
+ done
+
  - name: Add latest tag to images
  if: github.ref_name == 'main'
  run: |
.github/workflows/pg-clients.yml | 72 (vendored)

@@ -13,6 +13,7 @@ on:
  paths:
  - '.github/workflows/pg-clients.yml'
  - 'test_runner/pg_clients/**'
+ - 'test_runner/logical_repl/**'
  - 'poetry.lock'
  workflow_dispatch:

@@ -49,6 +50,77 @@ jobs:
  image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
  secrets: inherit

+ test-logical-replication:
+ needs: [ build-build-tools-image ]
+ runs-on: ubuntu-22.04
+
+ container:
+ image: ${{ needs.build-build-tools-image.outputs.image }}
+ credentials:
+ username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+ password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+ options: --init --user root
+ services:
+ clickhouse:
+ image: clickhouse/clickhouse-server:24.6.3.64
+ ports:
+ - 9000:9000
+ - 8123:8123
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Download Neon artifact
+ uses: ./.github/actions/download
+ with:
+ name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+ path: /tmp/neon/
+ prefix: latest
+
+ - name: Create Neon Project
+ id: create-neon-project
+ uses: ./.github/actions/neon-project-create
+ with:
+ api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+ postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+
+ - name: Run tests
+ uses: ./.github/actions/run-python-test-set
+ with:
+ build_type: remote
+ test_selection: logical_repl
+ run_in_parallel: false
+ extra_params: -m remote_cluster
+ pg_version: ${{ env.DEFAULT_PG_VERSION }}
+ env:
+ BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
+
+ - name: Delete Neon Project
+ if: always()
+ uses: ./.github/actions/neon-project-delete
+ with:
+ project_id: ${{ steps.create-neon-project.outputs.project_id }}
+ api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+ - name: Create Allure report
+ if: ${{ !cancelled() }}
+ id: create-allure-report
+ uses: ./.github/actions/allure-report-generate
+ with:
+ store-test-results-into-db: true
+ env:
+ REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
+ - name: Post to a Slack channel
+ if: github.event.schedule && failure()
+ uses: slackapi/slack-github-action@v1
+ with:
+ channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
+ slack-message: |
+ Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
+ env:
+ SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
  test-postgres-client-libs:
  needs: [ build-build-tools-image ]
  runs-on: ubuntu-22.04
Cargo.lock | 188 (generated)

@@ -1418,7 +1418,7 @@ dependencies = [
  "clap",
  "criterion-plot",
  "is-terminal",
- "itertools",
+ "itertools 0.10.5",
  "num-traits",
  "once_cell",
  "oorandom",
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
  dependencies = [
  "cast",
- "itertools",
+ "itertools 0.10.5",
  ]

  [[package]]
@@ -1672,6 +1672,7 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
  dependencies = [
  "bitflags 2.4.1",
  "byteorder",
+ "chrono",
  "diesel_derives",
  "itoa",
  "pq-sys",
@@ -2133,6 +2134,12 @@ dependencies = [
  "slab",
  ]

+ [[package]]
+ name = "gen_ops"
+ version = "0.4.0"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
+
  [[package]]
  name = "generic-array"
  version = "0.14.7"
@@ -2709,17 +2716,6 @@ version = "3.0.4"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"

- [[package]]
- name = "io-lifetimes"
- version = "1.0.11"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
- dependencies = [
- "hermit-abi",
- "libc",
- "windows-sys 0.48.0",
- ]
-
  [[package]]
  name = "io-uring"
  version = "0.6.2"
@@ -2738,14 +2734,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"

  [[package]]
  name = "is-terminal"
- version = "0.4.7"
+ version = "0.4.12"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
+ checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
  dependencies = [
  "hermit-abi",
- "io-lifetimes",
- "rustix 0.37.25",
- "windows-sys 0.48.0",
+ "libc",
+ "windows-sys 0.52.0",
  ]

  [[package]]
@@ -2757,6 +2752,15 @@ dependencies = [
  "either",
  ]

+ [[package]]
+ name = "itertools"
+ version = "0.12.1"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+ dependencies = [
+ "either",
+ ]
+
  [[package]]
  name = "itoa"
  version = "1.0.6"
@@ -2871,18 +2875,6 @@ version = "0.2.8"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"

- [[package]]
- name = "linux-raw-sys"
- version = "0.1.4"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
-
- [[package]]
- name = "linux-raw-sys"
- version = "0.3.8"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
-
  [[package]]
  name = "linux-raw-sys"
  version = "0.4.13"
@@ -3000,7 +2992,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
  dependencies = [
  "libc",
  "measured",
- "procfs 0.16.0",
+ "procfs",
  ]

  [[package]]
@@ -3045,7 +3037,7 @@ dependencies = [
  "measured",
  "measured-process",
  "once_cell",
- "procfs 0.14.2",
+ "procfs",
  "prometheus",
  "rand 0.8.5",
  "rand_distr",
@@ -3574,7 +3566,7 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
- "itertools",
+ "itertools 0.10.5",
  "leaky-bucket",
  "md5",
  "metrics",
@@ -3592,8 +3584,9 @@ dependencies = [
  "postgres_connection",
  "postgres_ffi",
  "pq_proto",
- "procfs 0.14.2",
+ "procfs",
  "rand 0.8.5",
+ "range-set-blaze",
  "regex",
  "remote_storage",
  "reqwest 0.12.4",
@@ -3644,7 +3637,7 @@ dependencies = [
  "hex",
  "humantime",
  "humantime-serde",
- "itertools",
+ "itertools 0.10.5",
  "postgres_ffi",
  "rand 0.8.5",
  "serde",
@@ -3702,7 +3695,7 @@ dependencies = [
  "hex-literal",
  "humantime",
  "humantime-serde",
- "itertools",
+ "itertools 0.10.5",
  "metrics",
  "once_cell",
  "pageserver_api",
@@ -4034,7 +4027,7 @@ name = "postgres_connection"
  version = "0.1.0"
  dependencies = [
  "anyhow",
- "itertools",
+ "itertools 0.10.5",
  "once_cell",
  "postgres",
  "tokio-postgres",
@@ -4092,7 +4085,7 @@ version = "0.1.0"
  dependencies = [
  "byteorder",
  "bytes",
- "itertools",
+ "itertools 0.10.5",
  "pin-project-lite",
  "postgres-protocol",
  "rand 0.8.5",
@@ -4138,21 +4131,6 @@ dependencies = [
  "unicode-ident",
  ]

- [[package]]
- name = "procfs"
- version = "0.14.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
- dependencies = [
- "bitflags 1.3.2",
- "byteorder",
- "chrono",
- "flate2",
- "hex",
- "lazy_static",
- "rustix 0.36.16",
- ]
-
  [[package]]
  name = "procfs"
  version = "0.16.0"
@@ -4160,10 +4138,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
  dependencies = [
  "bitflags 2.4.1",
+ "chrono",
+ "flate2",
  "hex",
  "lazy_static",
  "procfs-core",
- "rustix 0.38.28",
+ "rustix",
  ]

  [[package]]
@@ -4173,14 +4153,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
  dependencies = [
  "bitflags 2.4.1",
+ "chrono",
  "hex",
  ]

  [[package]]
  name = "prometheus"
- version = "0.13.3"
+ version = "0.13.4"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
+ checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
  dependencies = [
  "cfg-if",
  "fnv",
@@ -4188,7 +4169,7 @@ dependencies = [
  "libc",
  "memchr",
  "parking_lot 0.12.1",
- "procfs 0.14.2",
+ "procfs",
  "thiserror",
  ]

@@ -4210,7 +4191,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
  dependencies = [
  "bytes",
  "heck 0.4.1",
- "itertools",
+ "itertools 0.10.5",
  "lazy_static",
  "log",
  "multimap",
@@ -4231,7 +4212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
  dependencies = [
  "anyhow",
- "itertools",
+ "itertools 0.10.5",
  "proc-macro2",
  "quote",
  "syn 1.0.109",
@@ -4288,7 +4269,7 @@ dependencies = [
  "hyper-util",
  "indexmap 2.0.1",
  "ipnet",
- "itertools",
+ "itertools 0.10.5",
  "lasso",
  "md5",
  "measured",
@@ -4464,6 +4445,18 @@ dependencies = [
  "rand_core 0.5.1",
  ]

+ [[package]]
+ name = "range-set-blaze"
+ version = "0.1.16"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
+ dependencies = [
+ "gen_ops",
+ "itertools 0.12.1",
+ "num-integer",
+ "num-traits",
+ ]
+
  [[package]]
  name = "rayon"
  version = "1.7.0"
@@ -4632,7 +4625,7 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
- "itertools",
+ "itertools 0.10.5",
  "metrics",
  "once_cell",
  "pin-project-lite",
@@ -4942,34 +4935,6 @@ dependencies = [
  "nom",
  ]

- [[package]]
- name = "rustix"
- version = "0.36.16"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
- dependencies = [
- "bitflags 1.3.2",
- "errno",
- "io-lifetimes",
- "libc",
- "linux-raw-sys 0.1.4",
- "windows-sys 0.45.0",
- ]
-
- [[package]]
- name = "rustix"
- version = "0.37.25"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
- dependencies = [
- "bitflags 1.3.2",
- "errno",
- "io-lifetimes",
- "libc",
- "linux-raw-sys 0.3.8",
- "windows-sys 0.48.0",
- ]
-
  [[package]]
  name = "rustix"
  version = "0.38.28"
@@ -5718,6 +5683,7 @@ dependencies = [
  "aws-config",
  "bytes",
  "camino",
+ "chrono",
  "clap",
  "control_plane",
  "diesel",
@@ -5728,7 +5694,7 @@ dependencies = [
  "hex",
  "humantime",
  "hyper 0.14.26",
- "itertools",
+ "itertools 0.10.5",
  "lasso",
  "measured",
  "metrics",
@@ -5737,6 +5703,7 @@ dependencies = [
  "pageserver_client",
  "postgres_connection",
  "r2d2",
+ "rand 0.8.5",
  "reqwest 0.12.4",
  "routerify",
  "scopeguard",
@@ -5792,9 +5759,10 @@ dependencies = [
  "either",
  "futures",
  "futures-util",
+ "git-version",
  "hex",
  "humantime",
- "itertools",
+ "itertools 0.10.5",
  "once_cell",
  "pageserver",
  "pageserver_api",
@@ -5971,15 +5939,15 @@ dependencies = [

  [[package]]
  name = "tempfile"
- version = "3.5.0"
+ version = "3.9.0"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
+ checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
  dependencies = [
  "cfg-if",
- "fastrand 1.9.0",
- "redox_syscall 0.3.5",
- "rustix 0.37.25",
- "windows-sys 0.45.0",
+ "fastrand 2.0.0",
+ "redox_syscall 0.4.1",
+ "rustix",
+ "windows-sys 0.52.0",
  ]

  [[package]]
@@ -7176,15 +7144,6 @@ dependencies = [
  "windows_x86_64_msvc 0.42.2",
  ]

- [[package]]
- name = "windows-sys"
- version = "0.45.0"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
- dependencies = [
- "windows-targets 0.42.2",
- ]
-
  [[package]]
  name = "windows-sys"
  version = "0.48.0"
@@ -7203,21 +7162,6 @@ dependencies = [
  "windows-targets 0.52.4",
  ]

- [[package]]
- name = "windows-targets"
- version = "0.42.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
- dependencies = [
- "windows_aarch64_gnullvm 0.42.2",
- "windows_aarch64_msvc 0.42.2",
- "windows_i686_gnu 0.42.2",
- "windows_i686_msvc 0.42.2",
- "windows_x86_64_gnu 0.42.2",
- "windows_x86_64_gnullvm 0.42.2",
- "windows_x86_64_msvc 0.42.2",
- ]
-
  [[package]]
  name = "windows-targets"
  version = "0.48.0"
@@ -7447,7 +7391,7 @@ dependencies = [
  "hmac",
  "hyper 0.14.26",
  "indexmap 1.9.3",
- "itertools",
+ "itertools 0.10.5",
  "libc",
  "log",
  "memchr",
@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
  parquet_derive = "51.0.0"
  pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
  pin-project-lite = "0.2"
- procfs = "0.14"
+ procfs = "0.16"
  prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
  prost = "0.11"
  rand = "0.8"
@@ -4,6 +4,11 @@ version = "0.1.0"
  edition.workspace = true
  license.workspace = true

+ [features]
+ default = []
+ # Enables test specific features.
+ testing = []
+
  [dependencies]
  anyhow.workspace = true
  async-compression.workspace = true
@@ -400,7 +400,15 @@ impl ComputeNode {
  pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
  let mut retry_period_ms = 500.0;
  let mut attempts = 0;
- let max_attempts = 10;
+ const DEFAULT_ATTEMPTS: u16 = 10;
+ #[cfg(feature = "testing")]
+ let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
+ u16::from_str(&v).unwrap()
+ } else {
+ DEFAULT_ATTEMPTS
+ };
+ #[cfg(not(feature = "testing"))]
+ let max_attempts = DEFAULT_ATTEMPTS;
  loop {
  let result = self.try_get_basebackup(compute_state, lsn);
  match result {
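The hunk above keeps the default cap of 10 basebackup attempts, but when the crate's new `testing` feature is enabled it lets tests override the cap through the `NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES` environment variable. A condensed sketch of that selection logic, using the `cfg!` macro for brevity rather than the attribute form used in the diff, could look like this:

```rust
/// Sketch of the attempt-cap selection from the hunk above. In the real diff the
/// environment lookup is compiled out entirely via #[cfg(feature = "testing")];
/// here cfg! keeps the two paths side by side.
fn max_basebackup_attempts() -> u16 {
    const DEFAULT_ATTEMPTS: u16 = 10;
    if cfg!(feature = "testing") {
        if let Ok(v) = std::env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
            return v.parse().expect("retry override must parse as u16");
        }
    }
    DEFAULT_ATTEMPTS
}
```

Gating the override behind a feature keeps production builds free of the environment-variable code path while tests can force a very small retry budget.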
@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {

  fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
  for (var, val) in std::env::vars() {
- if var.starts_with("NEON_PAGESERVER_") {
+ if var.starts_with("NEON_") {
  cmd = cmd.env(var, val);
  }
  }
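This widens the pass-through from variables starting with `NEON_PAGESERVER_` to anything starting with `NEON_`. As a standalone sketch (the helper name is illustrative):

```rust
use std::process::Command;

/// Forward every NEON_-prefixed environment variable to a child process,
/// mirroring the widened prefix check in the hunk above.
fn forward_neon_env(mut cmd: Command) -> Command {
    for (var, val) in std::env::vars() {
        if var.starts_with("NEON_") {
            cmd.env(var, val);
        }
    }
    cmd
}
```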
@@ -514,7 +514,6 @@ impl LocalEnv {
  #[derive(serde::Serialize, serde::Deserialize)]
  // (allow unknown fields, unlike PageServerConf)
  struct PageserverConfigTomlSubset {
- id: NodeId,
  listen_pg_addr: String,
  listen_http_addr: String,
  pg_auth_type: AuthType,
@@ -526,18 +525,30 @@ impl LocalEnv {
  .with_context(|| format!("read {:?}", config_toml_path))?,
  )
  .context("parse pageserver.toml")?;
+ let identity_toml_path = dentry.path().join("identity.toml");
+ #[derive(serde::Serialize, serde::Deserialize)]
+ struct IdentityTomlSubset {
+ id: NodeId,
+ }
+ let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
+ &std::fs::read_to_string(&identity_toml_path)
+ .with_context(|| format!("read {:?}", identity_toml_path))?,
+ )
+ .context("parse identity.toml")?;
  let PageserverConfigTomlSubset {
- id: config_toml_id,
  listen_pg_addr,
  listen_http_addr,
  pg_auth_type,
  http_auth_type,
  } = config_toml;
+ let IdentityTomlSubset {
+ id: identity_toml_id,
+ } = identity_toml;
  let conf = PageServerConf {
  id: {
  anyhow::ensure!(
- config_toml_id == id,
- "id mismatch: config_toml.id={config_toml_id} id={id}",
+ identity_toml_id == id,
+ "id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
  );
  id
  },
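The node id moves out of `pageserver.toml` into a separate `identity.toml`, and the loader now cross-checks the id read from that file against the expected one. A self-contained sketch of the new parsing step (with a plain `u64` standing in for `NodeId`, and assuming `toml_edit` with its serde support as used in the hunk) might look like:

```rust
use anyhow::Context;

#[derive(serde::Deserialize)]
struct IdentityTomlSubset {
    id: u64, // stands in for NodeId in this sketch
}

/// Read identity.toml next to pageserver.toml and verify the recorded id.
fn read_node_id(identity_toml_path: &std::path::Path, expected_id: u64) -> anyhow::Result<u64> {
    let raw = std::fs::read_to_string(identity_toml_path)
        .with_context(|| format!("read {identity_toml_path:?}"))?;
    let identity: IdentityTomlSubset =
        toml_edit::de::from_str(&raw).context("parse identity.toml")?;
    anyhow::ensure!(
        identity.id == expected_id,
        "id mismatch: identity.toml:id={} expected id={}",
        identity.id,
        expected_id
    );
    Ok(identity.id)
}
```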
@@ -127,10 +127,13 @@ impl PageServerNode {
  }

  // Apply the user-provided overrides
- overrides.push(
- toml_edit::ser::to_string_pretty(&conf)
- .expect("we deserialized this from toml earlier"),
- );
+ overrides.push({
+ let mut doc =
+ toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
+ // `id` is written out to `identity.toml` instead of `pageserver.toml`
+ doc.remove("id").expect("it's part of the struct");
+ doc.to_string()
+ });

  // Turn `overrides` into a toml document.
  // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
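Because the id now lives in `identity.toml`, the override that gets pushed into `pageserver.toml` is built by serializing the config to a `toml_edit` document and stripping the `id` key first. Extracted into a small helper (name and signature are illustrative), that step is roughly:

```rust
/// Sketch of the override construction from the hunk above: serialize the config,
/// drop `id` (it belongs to identity.toml now), and emit the remaining TOML.
fn config_override_without_id<T: serde::Serialize>(conf: &T) -> String {
    let mut doc = toml_edit::ser::to_document(conf)
        .expect("we deserialized this from toml earlier");
    doc.remove("id").expect("it's part of the struct");
    doc.to_string()
}
```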
@@ -1,5 +1,6 @@
+use std::collections::HashSet;
 use std::str::FromStr;
-use std::time::Instant;
+use std::time::{Duration, Instant};

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix). Implemented by the server

@@ -294,6 +295,42 @@ pub enum PlacementPolicy {
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

+/// Metadata health record posted from scrubber.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthRecord {
+    pub tenant_shard_id: TenantShardId,
+    pub healthy: bool,
+    pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthUpdateRequest {
+    pub healthy_tenant_shards: HashSet<TenantShardId>,
+    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthUpdateResponse {}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthListUnhealthyResponse {
+    pub unhealthy_tenant_shards: Vec<TenantShardId>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthListOutdatedRequest {
+    #[serde(with = "humantime_serde")]
+    pub not_scrubbed_for: Duration,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MetadataHealthListOutdatedResponse {
+    pub health_records: Vec<MetadataHealthRecord>,
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
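Note (not part of the diff): a sketch of how a scrubber-side client might build and serialize these request types. The JSON shape follows from the derives above, but the string form used for the tenant shard id and the exact field values are assumptions for illustration.

use std::collections::HashSet;
use std::time::Duration;

use serde::{Deserialize, Serialize};

// Stand-in for pageserver_api's TenantShardId; a plain string keeps the sketch self-contained.
type TenantShardId = String;

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateRequest {
    pub healthy_tenant_shards: HashSet<TenantShardId>,
    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
    #[serde(with = "humantime_serde")]
    pub not_scrubbed_for: Duration,
}

fn main() -> serde_json::Result<()> {
    let update = MetadataHealthUpdateRequest {
        healthy_tenant_shards: HashSet::from(["tenant-shard-0".to_string()]),
        unhealthy_tenant_shards: HashSet::new(),
    };
    // e.g. {"healthy_tenant_shards":["tenant-shard-0"],"unhealthy_tenant_shards":[]}
    println!("{}", serde_json::to_string(&update)?);

    let outdated = MetadataHealthListOutdatedRequest {
        not_scrubbed_for: Duration::from_secs(3600),
    };
    // humantime_serde renders the duration in humantime form, e.g. "1h"
    println!("{}", serde_json::to_string(&outdated)?);
    Ok(())
}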
@@ -355,7 +355,8 @@ impl RemoteStorage for AzureBlobStorage {
            .blobs()
            .map(|k| ListingObject{
                key: self.name_to_relative_path(&k.name),
-                last_modified: k.properties.last_modified.into()
+                last_modified: k.properties.last_modified.into(),
+                size: k.properties.content_length,
            }
        );

@@ -144,6 +144,7 @@ impl RemotePath {
 ///
 /// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
 /// NoDelimiter mode will only populate `keys`.
+#[derive(Copy, Clone)]
 pub enum ListingMode {
     WithDelimiter,
     NoDelimiter,

@@ -153,6 +154,7 @@ pub enum ListingMode {
 pub struct ListingObject {
     pub key: RemotePath,
     pub last_modified: SystemTime,
+    pub size: u64,
 }

 #[derive(Default)]
@@ -194,7 +196,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>>;
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;

    async fn list(
        &self,

@@ -351,10 +353,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
        match self {
            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
-                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
+                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
@@ -368,6 +368,7 @@ impl RemoteStorage for LocalFs {
                key: k.clone(),
                // LocalFs is just for testing, so just specify a dummy time
                last_modified: SystemTime::now(),
+                size: 0,
            })
        }
    })

@@ -411,6 +412,7 @@ impl RemoteStorage for LocalFs {
                key: RemotePath::from_string(&relative_key).unwrap(),
                // LocalFs is just for testing
                last_modified: SystemTime::now(),
+                size: 0,
            });
        }
    }
@@ -565,9 +565,12 @@ impl RemoteStorage for S3Bucket {
            }
        };

+        let size = object.size.unwrap_or(0) as u64;
+
        result.keys.push(ListingObject{
            key,
-            last_modified
+            last_modified,
+            size,
        });
        if let Some(mut mk) = max_keys {
            assert!(mk > 0);
@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
        async_stream::stream! {
            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
                .map_err(DownloadError::Other)?;
@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
 #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
-    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
+    /// Provides access to all data for a specific tenant (specified in `struct Claims` below)
     // TODO: join these two?
     Tenant,
-    // Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
-    // Should only be used e.g. for status check/tenant creation/list.
+    /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
+    /// Should only be used e.g. for status check/tenant creation/list.
     PageServerApi,
-    // Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
-    // Should only be used e.g. for status check.
-    // Currently also used for connection from any pageserver to any safekeeper.
+    /// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
+    /// Should only be used e.g. for status check.
+    /// Currently also used for connection from any pageserver to any safekeeper.
     SafekeeperData,
-    // The scope used by pageservers in upcalls to storage controller and cloud control plane
+    /// The scope used by pageservers in upcalls to storage controller and cloud control plane
     #[serde(rename = "generations_api")]
     GenerationsApi,
-    // Allows access to control plane managment API and some storage controller endpoints.
+    /// Allows access to control plane managment API and some storage controller endpoints.
     Admin,

     /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
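Note (not part of the diff): the serde attributes above determine the wire form of each scope: with rename_all = "lowercase" the variants serialize as e.g. "tenant" and "pageserverapi", while the explicit rename maps GenerationsApi to "generations_api". A minimal round-trip sketch on a reduced copy of the enum (the surrounding Claims layout is not shown in the diff and is not assumed here):

use serde::{Deserialize, Serialize};

// Reduced copy of the Scope enum, only to illustrate the serde renames.
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum Scope {
    Tenant,
    PageServerApi,
    SafekeeperData,
    #[serde(rename = "generations_api")]
    GenerationsApi,
    Admin,
}

fn main() -> serde_json::Result<()> {
    // Variant names are lowercased on the wire ...
    assert_eq!(serde_json::to_string(&Scope::PageServerApi)?, "\"pageserverapi\"");
    // ... except where an explicit rename overrides that.
    assert_eq!(serde_json::to_string(&Scope::GenerationsApi)?, "\"generations_api\"");
    let parsed: Scope = serde_json::from_str("\"tenant\"")?;
    assert_eq!(parsed, Scope::Tenant);
    Ok(())
}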
@@ -49,6 +49,7 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
+range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
@@ -1,3 +1,4 @@
+use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;

@@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
+
+fn fixture_path(relative: &str) -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
+}

 fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
     let mut layer_map = LayerMap::default();
@@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
 // between each test run.
 fn bench_from_captest_env(c: &mut Criterion) {
     // TODO consider compressing this file
-    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
     let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

     // Test with uniform query pattern

@@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
 fn bench_from_real_project(c: &mut Criterion) {
     // Init layer map
     let now = Instant::now();
-    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
     println!("Finished layer map init in {:?}", now.elapsed());

     // Choose uniformly distributed queries
@@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) {
     group.finish();
 }

+fn bench_visibility_with_map(
+    group: &mut BenchmarkGroup<WallTime>,
+    layer_map: LayerMap,
+    read_points: Vec<Lsn>,
+    bench_name: &str,
+) {
+    group.bench_function(bench_name, |b| {
+        b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
+    });
+}
+
+// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
+fn bench_visibility(c: &mut Criterion) {
+    let mut group = c.benchmark_group("visibility");
+    {
+        // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
+        let now = Instant::now();
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+        for i in 0..100_000 {
+            let i32 = (i as u32) % 100;
+            let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
+            let layer = PersistentLayerDesc::new_img(
+                TenantShardId::unsharded(TenantId::generate()),
+                TimelineId::generate(),
+                zero.add(10 * i32)..zero.add(10 * i32 + 1),
+                Lsn(i),
+                0,
+            );
+            updates.insert_historic(layer);
+        }
+        updates.flush();
+        println!("Finished layer map init in {:?}", now.elapsed());
+
+        let mut read_points = Vec::new();
+        for i in (0..100_000).step_by(1000) {
+            read_points.push(Lsn(i));
+        }
+
+        bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
+    }
+
+    {
+        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+        let read_points = vec![Lsn(0x1C760FA190)];
+        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
+
+        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+        let read_points = vec![
+            Lsn(0x1C760FA190),
+            Lsn(0x000000931BEAD539),
+            Lsn(0x000000931BF63011),
+            Lsn(0x000000931B33AE68),
+            Lsn(0x00000038E67ABFA0),
+            Lsn(0x000000931B33AE68),
+            Lsn(0x000000914E3F38F0),
+            Lsn(0x000000931B33AE68),
+        ];
+        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
+    }
+
+    group.finish();
+}
+
 criterion_group!(group_1, bench_from_captest_env);
 criterion_group!(group_2, bench_from_real_project);
 criterion_group!(group_3, bench_sequential);
-criterion_main!(group_1, group_2, group_3);
+criterion_group!(group_4, bench_visibility);
+criterion_main!(group_1, group_2, group_3, group_4);
@@ -17,11 +17,9 @@ use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
-use pageserver::task_mgr::WALRECEIVER_RUNTIME;
+use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
-};
+use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;

@@ -31,11 +29,9 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
     config::PageServerConf,
-    context::{DownloadBehavior, RequestContext},
     deletion_queue::DeletionQueue,
     http, page_cache, page_service, task_mgr,
-    task_mgr::TaskKind,
-    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
+    task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
     tenant::mgr,
     virtual_file,
 };
@@ -129,6 +125,7 @@ fn main() -> anyhow::Result<()> {
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.get_impl, "starting with get page implementation");
    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
+    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
@@ -593,30 +590,13 @@ fn start_pageserver(

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let libpq_listener = {
-        let cancel = CancellationToken::new();
-        let libpq_ctx = RequestContext::todo_child(
-            TaskKind::LibpqEndpointListener,
-            // listener task shouldn't need to download anything. (We will
-            // create a separate sub-contexts for each connection, with their
-            // own download behavior. This context is used only to listen and
-            // accept connections.)
-            DownloadBehavior::Error,
-        );
-
-        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-            "libpq listener",
-            page_service::libpq_listener_main(
-                tenant_manager.clone(),
-                pg_auth,
-                pageserver_listener,
-                conf.pg_auth_type,
-                libpq_ctx,
-                cancel.clone(),
-            ),
-        ));
-        LibpqEndpointListener(CancellableTask { task, cancel })
-    };
+    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
+        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
+        pageserver_listener
+            .set_nonblocking(true)
+            .context("set listener to nonblocking")?;
+        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
+    });

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -644,7 +624,7 @@ fn start_pageserver(
    shutdown_pageserver.take();
    pageserver::shutdown_pageserver(
        http_endpoint_listener,
-        libpq_listener,
+        page_service,
        consumption_metrics_tasks,
        disk_usage_eviction_task,
        &tenant_manager,
@@ -29,6 +29,7 @@ use utils::{
    logging::LogFormat,
 };

+use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};

@@ -295,6 +296,10 @@ pub struct PageServerConf {
    pub ephemeral_bytes_per_memory_kb: usize,

    pub l0_flush: L0FlushConfig,
+
+    /// This flag is temporary and will be removed after gradual rollout.
+    /// See <https://github.com/neondatabase/neon/issues/8184>.
+    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -356,8 +361,6 @@ struct PageServerConfigBuilder {
    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

-    id: BuilderValue<NodeId>,
-
    broker_endpoint: BuilderValue<Uri>,
    broker_keepalive_interval: BuilderValue<Duration>,

@@ -403,14 +406,13 @@ struct PageServerConfigBuilder {
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,

    l0_flush: BuilderValue<L0FlushConfig>,
+
+    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
 }

 impl PageServerConfigBuilder {
-    fn new(node_id: NodeId) -> Self {
-        let mut this = Self::default();
-        this.id(node_id);
-
-        this
+    fn new() -> Self {
+        Self::default()
    }

    #[inline(always)]
@@ -438,7 +440,6 @@ impl PageServerConfigBuilder {
            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
            remote_storage_config: Set(None),
-            id: NotSet,
            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                .parse()
                .expect("failed to parse default broker endpoint")),

@@ -496,6 +497,7 @@ impl PageServerConfigBuilder {
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: Set(L0FlushConfig::default()),
+            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
        }
    }
 }
@@ -568,10 +570,6 @@ impl PageServerConfigBuilder {
        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
    }

-    pub fn id(&mut self, node_id: NodeId) {
-        self.id = BuilderValue::Set(node_id)
-    }
-
    pub fn log_format(&mut self, log_format: LogFormat) {
        self.log_format = BuilderValue::Set(log_format)
    }

@@ -683,7 +681,11 @@ impl PageServerConfigBuilder {
        self.l0_flush = BuilderValue::Set(value);
    }

-    pub fn build(self) -> anyhow::Result<PageServerConf> {
+    pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
+        self.compact_level0_phase1_value_access = BuilderValue::Set(value);
+    }
+
+    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

        macro_rules! conf {
@@ -716,7 +718,6 @@ impl PageServerConfigBuilder {
            pg_auth_type,
            auth_validation_public_key_path,
            remote_storage_config,
-            id,
            broker_endpoint,
            broker_keepalive_interval,
            log_format,

@@ -741,9 +742,11 @@ impl PageServerConfigBuilder {
            image_compression,
            ephemeral_bytes_per_memory_kb,
            l0_flush,
+            compact_level0_phase1_value_access,
        }
        CUSTOM LOGIC
        {
+            id: id,
            // TenantConf is handled separately
            default_tenant_conf: TenantConf::default(),
            concurrent_tenant_warmup: ConfigurableSemaphore::new({
@@ -893,7 +896,7 @@ impl PageServerConf {
        toml: &Document,
        workdir: &Utf8Path,
    ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new(node_id);
+        let mut builder = PageServerConfigBuilder::new();
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();

@@ -924,8 +927,6 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
-                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
-                // Logging is not set up yet, so we can't do it.
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -1014,11 +1015,14 @@ impl PageServerConf {
                "l0_flush" => {
                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
                }
+                "compact_level0_phase1_value_access" => {
+                    builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }

-        let mut conf = builder.build().context("invalid config")?;
+        let mut conf = builder.build(node_id).context("invalid config")?;

        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
            let auth_validation_public_key_path = conf

@@ -1098,6 +1102,7 @@ impl PageServerConf {
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            l0_flush: L0FlushConfig::default(),
+            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
        }
    }
 }
@@ -1255,7 +1260,6 @@ max_file_descriptors = 333

 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zzzz'
-id = 10

 metric_collection_interval = '222 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'

@@ -1272,9 +1276,8 @@ background_task_maximum_delay = '334 s'
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
        // we have to create dummy values to overcome the validation errors
-        let config_string = format!(
-            "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
-        );
+        let config_string =
+            format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
        let toml = config_string.parse()?;

        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
@@ -1341,6 +1344,7 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
+                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );

@@ -1415,6 +1419,7 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
+                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1579,7 +1584,6 @@ broker_endpoint = '{broker_endpoint}'
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
-id = 222

 [disk_usage_based_eviction]
 max_usage_pct = 80

@@ -1649,7 +1653,6 @@ threshold = "20m"
            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
-id = 222

 [tenant_config]
 evictions_low_residence_duration_metric_threshold = "20m"
@@ -296,6 +296,11 @@ impl From<GetActiveTenantError> for ApiError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => {
                ApiError::ResourceUnavailable(format!("{}", e).into())
            }
+            GetActiveTenantError::SwitchedTenant => {
+                // in our HTTP handlers, this error doesn't happen
+                // TODO: separate error types
+                ApiError::ResourceUnavailable("switched tenant".into())
+            }
        }
    }
 }
@@ -2129,14 +2134,24 @@ async fn secondary_download_handler(
    let timeout = wait.unwrap_or(Duration::MAX);

-    let status = match tokio::time::timeout(
+    let result = tokio::time::timeout(
        timeout,
        state.secondary_controller.download_tenant(tenant_shard_id),
    )
-    .await
-    {
-        // Download job ran to completion.
-        Ok(Ok(())) => StatusCode::OK,
+    .await;
+
+    let progress = secondary_tenant.progress.lock().unwrap().clone();
+
+    let status = match result {
+        Ok(Ok(())) => {
+            if progress.layers_downloaded >= progress.layers_total {
+                // Download job ran to completion
+                StatusCode::OK
+            } else {
+                // Download dropped out without errors because it ran out of time budget
+                StatusCode::ACCEPTED
+            }
+        }
        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
        // okay. We could get an error here in the unlikely edge case that the tenant
        // was detached between our check above and executing the download job.

@@ -2146,8 +2161,6 @@ async fn secondary_download_handler(
        Err(_) => StatusCode::ACCEPTED,
    };

-    let progress = secondary_tenant.progress.lock().unwrap().clone();
-
    json_response(status, progress)
 }

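Note (not part of the diff): a condensed sketch of the status selection the rewritten handler performs. The progress snapshot is taken once before matching on the (possibly timed-out) download result; names, the Option-based signature, and the placeholder handling of the error arm are local to the sketch and not part of the real handler.

use http::StatusCode;

// Minimal stand-in for the progress snapshot taken from the secondary tenant.
struct Progress {
    layers_downloaded: u64,
    layers_total: u64,
}

// Mirrors the branch structure above: a completed download only reports 200 OK when
// every known layer was fetched; running out of the time budget reports 202 Accepted.
fn download_status(outcome: Option<bool>, progress: &Progress) -> StatusCode {
    match outcome {
        Some(true) if progress.layers_downloaded >= progress.layers_total => StatusCode::OK,
        // Finished without error but with layers still missing: out of time budget.
        Some(true) => StatusCode::ACCEPTED,
        // Placeholder: the real error arm lies outside the shown hunk.
        Some(false) => StatusCode::ACCEPTED,
        // The outer wait deadline elapsed before the download job finished.
        None => StatusCode::ACCEPTED,
    }
}

fn main() {
    let progress = Progress { layers_downloaded: 10, layers_total: 10 };
    assert_eq!(download_status(Some(true), &progress), StatusCode::OK);
    assert_eq!(download_status(None, &progress), StatusCode::ACCEPTED);
}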
@@ -2,13 +2,23 @@ use std::{num::NonZeroUsize, sync::Arc};

 use crate::tenant::ephemeral_file;

-#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
-    #[default]
     PageCached,
     #[serde(rename_all = "snake_case")]
-    Direct { max_concurrency: NonZeroUsize },
+    Direct {
+        max_concurrency: NonZeroUsize,
+    },
+}
+
+impl Default for L0FlushConfig {
+    fn default() -> Self {
+        Self::Direct {
+            // TODO: using num_cpus results in different peak memory usage on different instance types.
+            max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
+        }
+    }
 }

 #[derive(Clone)]
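Note (not part of the diff): because the enum is internally tagged with "mode" and renamed to kebab-case (variant fields to snake_case), the accepted config shape follows directly from the serde attributes. A minimal sketch of that mapping on a reduced copy of the type; using serde_json here is only to keep the demo self-contained, the actual pageserver.toml syntax for this setting is not shown in the diff and is not assumed.

use std::num::NonZeroUsize;

use serde::Deserialize;

// Reduced copy of L0FlushConfig, only to illustrate how the serde attributes shape the config.
#[derive(Debug, PartialEq, Eq, Clone, Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
    PageCached,
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
}

fn main() {
    // mode = "direct" selects the Direct variant and requires max_concurrency.
    let direct: L0FlushConfig =
        serde_json::from_str(r#"{ "mode": "direct", "max_concurrency": 4 }"#).unwrap();
    assert_eq!(
        direct,
        L0FlushConfig::Direct { max_concurrency: NonZeroUsize::new(4).unwrap() }
    );

    // mode = "page-cached" selects the (now non-default) PageCached variant.
    let cached: L0FlushConfig = serde_json::from_str(r#"{ "mode": "page-cached" }"#).unwrap();
    assert_eq!(cached, L0FlushConfig::PageCached);
}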
@@ -12,6 +12,8 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
+
+use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 pub mod aux_file;

@@ -30,14 +32,13 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

-use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tenant::{
     mgr::{BackgroundPurges, TenantManager},
     secondary,
 };
-use tracing::info;
+use tracing::{info, info_span};

 /// Current storage format version
 ///
@@ -63,7 +64,6 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
-pub struct LibpqEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {

@@ -77,7 +77,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
-    libpq_listener: LibpqEndpointListener,
+    page_service: page_service::Listener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -87,10 +87,83 @@ pub async fn shutdown_pageserver(
    exit_code: i32,
 ) {
    use std::time::Duration;
+
+    // If the orderly shutdown below takes too long, we still want to make
+    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
+    //
+    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
+    //  that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
+    //
+    // We use a thread instead of a tokio task because the background runtime is likely busy
+    // with the final flushing / uploads. This activity here has priority, and due to lack
+    // of scheduling priority feature sin the tokio scheduler, using a separate thread is
+    // an effective priority booster.
+    let walredo_extraordinary_shutdown_thread_span = {
+        let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
+        span.follows_from(tracing::Span::current());
+        span
+    };
+    let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
+    let walredo_extraordinary_shutdown_thread = std::thread::spawn({
+        let walredo_extraordinary_shutdown_thread_cancel =
+            walredo_extraordinary_shutdown_thread_cancel.clone();
+        move || {
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+                .unwrap();
+            let _entered = rt.enter();
+            let _entered = walredo_extraordinary_shutdown_thread_span.enter();
+            if let Ok(()) = rt.block_on(tokio::time::timeout(
+                Duration::from_secs(8),
+                walredo_extraordinary_shutdown_thread_cancel.cancelled(),
+            )) {
+                info!("cancellation requested");
+                return;
+            }
+            let managers = tenant::WALREDO_MANAGERS
+                .lock()
+                .unwrap()
+                // prevents new walredo managers from being inserted
+                .take()
+                .expect("only we take()");
+            // Use FuturesUnordered to get in queue early for each manager's
+            // heavier_once_cell semaphore wait list.
+            // Also, for idle tenants that for some reason haven't
+            // shut down yet, it's quite likely that we're not going
+            // to get Poll::Pending once.
+            let mut futs: FuturesUnordered<_> = managers
+                .into_iter()
+                .filter_map(|(_, mgr)| mgr.upgrade())
+                .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
+                .collect();
+            info!(count=%futs.len(), "built FuturesUnordered");
+            let mut last_log_at = std::time::Instant::now();
+            #[derive(Debug, Default)]
+            struct Results {
+                initiated: u64,
+                already: u64,
+            }
+            let mut results = Results::default();
+            while let Some(we_initiated) = rt.block_on(futs.next()) {
+                if we_initiated {
+                    results.initiated += 1;
+                } else {
+                    results.already += 1;
+                }
+                if last_log_at.elapsed() > Duration::from_millis(100) {
+                    info!(remaining=%futs.len(), ?results, "progress");
+                    last_log_at = std::time::Instant::now();
+                }
+            }
+            info!(?results, "done");
+        }
+    });

    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    timed(
-        libpq_listener.0.shutdown(),
+    let remaining_connections = timed(
+        page_service.stop_accepting(),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -108,7 +181,7 @@ pub async fn shutdown_pageserver(
    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
    // should already have been canclled via mgr::shutdown_all_tenants
    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        remaining_connections.shutdown(),
        "shutdown PageRequestHandlers",
        Duration::from_secs(1),
    )

@@ -162,6 +235,12 @@ pub async fn shutdown_pageserver(
        Duration::from_secs(1),
    )
    .await;
+
+    info!("cancel & join walredo_extraordinary_shutdown_thread");
+    walredo_extraordinary_shutdown_thread_cancel.cancel();
+    walredo_extraordinary_shutdown_thread.join().unwrap();
+    info!("walredo_extraordinary_shutdown_thread done");
+
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
@@ -525,6 +525,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_visible_physical_size",
+        "The size of the layer files present in the pageserver's filesystem.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_resident_physical_size_global",
@@ -613,7 +622,23 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_compression_image_in_bytes_total",
-        "Size of uncompressed data written into image layers"
+        "Size of data written into image layers before compression"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_compression_image_in_bytes_considered",
+        "Size of potentially compressible data written into image layers before compression"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_compression_image_in_bytes_chosen",
+        "Size of data whose compressed form was written into image layers"
    )
    .expect("failed to define a metric")
 });
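Note (not part of the diff): read together, the three counters suggest a nesting of chosen <= considered <= total input bytes; that nesting is an inference from the metric descriptions above, not stated in the diff. A small sketch of how the counters could be combined into ratios when inspecting scraped values:

// Derives rough compression ratios from the three counter readings; names are local to the sketch.
fn compression_ratios(total: u64, considered: u64, chosen: u64) -> (f64, f64) {
    // Share of image-layer input bytes that were eligible for compression at all.
    let eligible = considered as f64 / total.max(1) as f64;
    // Share of eligible bytes for which the compressed form actually won and was written.
    let accepted = chosen as f64 / considered.max(1) as f64;
    (eligible, accepted)
}

fn main() {
    let (eligible, accepted) = compression_ratios(1_000_000, 800_000, 600_000);
    println!("eligible={eligible:.2} accepted={accepted:.2}");
}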
@@ -2188,6 +2213,7 @@ pub(crate) struct TimelineMetrics {
    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
+    pub visible_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub aux_file_size_gauge: IntGauge,

@@ -2310,6 +2336,9 @@ impl TimelineMetrics {
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
+        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
        // TODO: we shouldn't expose this metric
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2364,6 +2393,7 @@ impl TimelineMetrics {
            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
+            visible_physical_size_gauge,
            current_logical_size_gauge,
            aux_file_size_gauge,
            directory_entries_count_gauge,

@@ -2415,6 +2445,7 @@ impl TimelineMetrics {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
+        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
[File diff suppressed because it is too large]
@@ -8,8 +8,7 @@ use std::time::Duration;
 pub use pageserver_api::key::{Key, KEY_SIZE};

 /// A 'value' stored for a one Key.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[cfg_attr(test, derive(PartialEq))]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub enum Value {
     /// An Image value contains a full copy of the value
     Image(Bytes),

@@ -33,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
+use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
@@ -312,14 +313,66 @@ impl std::fmt::Debug for Tenant {
 }

 pub(crate) enum WalRedoManager {
-    Prod(PostgresRedoManager),
+    Prod(WalredoManagerId, PostgresRedoManager),
     #[cfg(test)]
     Test(harness::TestRedoManager),
 }

-impl From<PostgresRedoManager> for WalRedoManager {
-    fn from(mgr: PostgresRedoManager) -> Self {
-        Self::Prod(mgr)
+#[derive(thiserror::Error, Debug)]
+#[error("pageserver is shutting down")]
+pub(crate) struct GlobalShutDown;
+
+impl WalRedoManager {
+    pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
+        let id = WalredoManagerId::next();
+        let arc = Arc::new(Self::Prod(id, mgr));
+        let mut guard = WALREDO_MANAGERS.lock().unwrap();
+        match &mut *guard {
+            Some(map) => {
+                map.insert(id, Arc::downgrade(&arc));
+                Ok(arc)
+            }
+            None => Err(GlobalShutDown),
+        }
+    }
+}
+
+impl Drop for WalRedoManager {
+    fn drop(&mut self) {
+        match self {
+            Self::Prod(id, _) => {
+                let mut guard = WALREDO_MANAGERS.lock().unwrap();
+                if let Some(map) = &mut *guard {
+                    map.remove(id).expect("new() registers, drop() unregisters");
+                }
+            }
+            #[cfg(test)]
+            Self::Test(_) => {
+                // Not applicable to test redo manager
+            }
+        }
+    }
+}
+
+/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
+/// the walredo processes outside of the regular order.
+///
+/// This is necessary to work around a systemd bug where it freezes if there are
+/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
+#[allow(clippy::type_complexity)]
+pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
+    Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
+> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
+#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
+pub(crate) struct WalredoManagerId(u64);
+impl WalredoManagerId {
+    pub fn next() -> Self {
+        static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
+        let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        if id == 0 {
+            panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
+        }
+        Self(id)
     }
 }
@@ -331,19 +384,20 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
-    pub(crate) async fn shutdown(&self) {
+    pub(crate) async fn shutdown(&self) -> bool {
        match self {
-            Self::Prod(mgr) => mgr.shutdown().await,
+            Self::Prod(_, mgr) => mgr.shutdown().await,
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
+                true
            }
        }
    }

    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
        match self {
-            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
+            Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
|
|||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
) -> Result<bytes::Bytes, walredo::Error> {
|
) -> Result<bytes::Bytes, walredo::Error> {
|
||||||
match self {
|
match self {
|
||||||
Self::Prod(mgr) => {
|
Self::Prod(_, mgr) => {
|
||||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
@@ -377,7 +431,7 @@ impl WalRedoManager {
|
|||||||
|
|
||||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||||
match self {
|
match self {
|
||||||
WalRedoManager::Prod(m) => Some(m.status()),
|
WalRedoManager::Prod(_, m) => Some(m.status()),
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
WalRedoManager::Test(_) => None,
|
WalRedoManager::Test(_) => None,
|
||||||
}
|
}
|
||||||
@@ -386,6 +440,8 @@ impl WalRedoManager
 
 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
+    #[error("Timeline is shutting down")]
+    ShuttingDown,
     #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
     NotActive {
         tenant_id: TenantShardId,
@@ -675,11 +731,9 @@ impl Tenant
         init_order: Option<InitializationOrder>,
         mode: SpawnMode,
         ctx: &RequestContext,
-    ) -> Arc<Tenant> {
-        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
-            conf,
-            tenant_shard_id,
-        )));
+    ) -> Result<Arc<Tenant>, GlobalShutDown> {
+        let wal_redo_manager =
+            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
 
         let TenantSharedResources {
             broker_client,
@@ -878,7 +932,7 @@ impl Tenant
             }
             .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
         );
-        tenant
+        Ok(tenant)
     }
 
     #[instrument(skip_all)]
@@ -1580,7 +1634,7 @@ impl Tenant
         self: Arc<Self>,
         timeline_id: TimelineId,
     ) -> Result<(), DeleteTimelineError> {
-        DeleteTimelineFlow::run(&self, timeline_id, false).await?;
+        DeleteTimelineFlow::run(&self, timeline_id).await?;
 
         Ok(())
     }
@@ -6909,7 +6963,11 @@ mod tests
             vec![
                 // Image layer at GC horizon
                 PersistentLayerKey {
-                    key_range: Key::MIN..Key::MAX,
+                    key_range: {
+                        let mut key = Key::MAX;
+                        key.field6 -= 1;
+                        Key::MIN..key
+                    },
                     lsn_range: Lsn(0x30)..Lsn(0x31),
                     is_delta: false
                 },
@@ -6928,6 +6986,15 @@ mod tests
             ]
         );
 
+        // increase GC horizon and compact again
+        {
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.time = Lsn(0x40);
+            guard.cutoffs.space = Lsn(0x40);
+        }
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
         Ok(())
     }
 
@@ -7279,6 +7346,15 @@ mod tests
             );
         }
 
+        // increase GC horizon and compact again
+        {
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.time = Lsn(0x40);
+            guard.cutoffs.space = Lsn(0x40);
+        }
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
         Ok(())
     }
 
@@ -7347,6 +7423,7 @@ mod tests
                 Lsn(0x60),
                 &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                 3,
+                None,
             )
             .await
             .unwrap();
@@ -7471,7 +7548,7 @@ mod tests
             ),
         ];
         let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
+            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
             .await
             .unwrap();
         let expected_res = KeyHistoryRetention {
@@ -7517,6 +7594,114 @@ mod tests
         };
         assert_eq!(res, expected_res);
 
+        // In case of branch compaction, the branch itself does not have the full history, and we need to provide
+        // the ancestor image in the test case.
+
+        let history = vec![
+            (
+                key,
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
+            ),
+            (
+                key,
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
+            ),
+            (
+                key,
+                Lsn(0x40),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
+            ),
+            (
+                key,
+                Lsn(0x70),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
+            ),
+        ];
+        let res = tline
+            .generate_key_retention(
+                key,
+                &history,
+                Lsn(0x60),
+                &[],
+                3,
+                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+            )
+            .await
+            .unwrap();
+        let expected_res = KeyHistoryRetention {
+            below_horizon: vec![(
+                Lsn(0x60),
+                KeyLogAtLsn(vec![(
+                    Lsn(0x60),
+                    Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
+                )]),
+            )],
+            above_horizon: KeyLogAtLsn(vec![(
+                Lsn(0x70),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
+            )]),
+        };
+        assert_eq!(res, expected_res);
+
+        let history = vec![
+            (
+                key,
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
+            ),
+            (
+                key,
+                Lsn(0x40),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
+            ),
+            (
+                key,
+                Lsn(0x60),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
+            ),
+            (
+                key,
+                Lsn(0x70),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
+            ),
+        ];
+        let res = tline
+            .generate_key_retention(
+                key,
+                &history,
+                Lsn(0x60),
+                &[Lsn(0x30)],
+                3,
+                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+            )
+            .await
+            .unwrap();
+        let expected_res = KeyHistoryRetention {
+            below_horizon: vec![
+                (
+                    Lsn(0x30),
+                    KeyLogAtLsn(vec![(
+                        Lsn(0x20),
+                        Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
+                    )]),
+                ),
+                (
+                    Lsn(0x60),
+                    KeyLogAtLsn(vec![(
+                        Lsn(0x60),
+                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
+                    )]),
+                ),
+            ],
+            above_horizon: KeyLogAtLsn(vec![(
+                Lsn(0x70),
+                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
+            )]),
+        };
+        assert_eq!(res, expected_res);
+
         Ok(())
     }
 
@@ -7674,6 +7859,10 @@ mod tests
         ];
 
         let verify_result = || async {
+            let gc_horizon = {
+                let gc_info = tline.gc_info.read().unwrap();
+                gc_info.cutoffs.time
+            };
             for idx in 0..10 {
                 assert_eq!(
                     tline
@@ -7684,7 +7873,7 @@ mod tests
                 );
                 assert_eq!(
                     tline
-                        .get(get_key(idx as u32), Lsn(0x30), &ctx)
+                        .get(get_key(idx as u32), gc_horizon, &ctx)
                        .await
                        .unwrap(),
                    &expected_result_at_gc_horizon[idx]
@@ -7710,6 +7899,205 @@ mod tests
 
         let cancel = CancellationToken::new();
         tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        verify_result().await;
+
+        // compact again
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        verify_result().await;
+
+        // increase GC horizon and compact again
+        {
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.time = Lsn(0x38);
+            guard.cutoffs.space = Lsn(0x38);
+        }
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
+
+        // not increasing the GC horizon and compact again
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+        verify_result().await;
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(2),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x28),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x40),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
+            ),
+        ];
+        let delta2 = vec![
+            (
+                get_key(5),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(6),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+        ];
+
+        let parent_tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![], // delta layers
+                vec![(Lsn(0x18), img_layer)], // image layers
+                Lsn(0x18),
+            )
+            .await?;
+
+        parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
+
+        let branch_tline = tenant
+            .branch_timeline_test_with_layers(
+                &parent_tline,
+                NEW_TIMELINE_ID,
+                Some(Lsn(0x18)),
+                &ctx,
+                vec![
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
+                ], // delta layers
+                vec![], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+
+        branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
+
+        {
+            // Update GC info
+            let mut guard = parent_tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x10),
+                    space: Lsn(0x10),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        {
+            // Update GC info
+            let mut guard = branch_tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x50),
+                    space: Lsn(0x50),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let expected_result_at_gc_horizon = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10@0x30"),
+            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10@0x20"),
+            Bytes::from_static(b"value 6@0x10@0x20"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10@0x48"),
+            Bytes::from_static(b"value 9@0x10@0x48"),
+        ];
+
+        let expected_result_at_lsn_40 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10@0x30"),
+            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10@0x20"),
+            Bytes::from_static(b"value 6@0x10@0x20"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let verify_result = || async {
+            for idx in 0..10 {
+                assert_eq!(
+                    branch_tline
+                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_gc_horizon[idx]
+                );
+                assert_eq!(
+                    branch_tline
+                        .get(get_key(idx as u32), Lsn(0x40), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_40[idx]
+                );
+            }
+        };
+
+        verify_result().await;
+
+        let cancel = CancellationToken::new();
+        branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
         verify_result().await;
 
@@ -28,6 +28,12 @@ use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};
 
+#[derive(Copy, Clone, Debug)]
+pub struct CompressionInfo {
+    pub written_compressed: bool,
+    pub compressed_size: Option<usize>,
+}
+
 impl<'a> BlockCursor<'a> {
     /// Read a blob into a new buffer.
     pub async fn read_blob(
@@ -273,8 +279,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         srcbuf: B,
         ctx: &RequestContext,
     ) -> (B::Buf, Result<u64, Error>) {
-        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await
+        let (buf, res) = self
+            .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
+            .await;
+        (buf, res.map(|(off, _compression_info)| off))
     }
 
     /// Write a blob of data. Returns the offset that it was written to,
@@ -284,8 +292,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         srcbuf: B,
         ctx: &RequestContext,
         algorithm: ImageCompressionAlgorithm,
-    ) -> (B::Buf, Result<u64, Error>) {
+    ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
         let offset = self.offset;
+        let mut compression_info = CompressionInfo {
+            written_compressed: false,
+            compressed_size: None,
+        };
 
         let len = srcbuf.bytes_init();
 
@@ -328,7 +340,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                 encoder.write_all(&slice[..]).await.unwrap();
                 encoder.shutdown().await.unwrap();
                 let compressed = encoder.into_inner();
+                compression_info.compressed_size = Some(compressed.len());
                 if compressed.len() < len {
+                    compression_info.written_compressed = true;
                     let compressed_len = compressed.len();
                     compressed_buf = Some(compressed);
                     (BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -359,7 +373,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         } else {
             self.write_all(srcbuf, ctx).await
         };
-        (srcbuf, res.map(|_| offset))
+        (srcbuf, res.map(|_| (offset, compression_info)))
     }
 }
 
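// Illustrative sketch (assumed usage, not code from this commit): a caller of the new
// write_blob_maybe_compressed signature can inspect the returned CompressionInfo to see
// whether compression was attempted and whether the compressed form was kept. The
// function name and the use of Vec<u8> as the buffer type are assumptions here.
async fn write_and_report(
    writer: &mut BlobWriter<false>,
    buf: Vec<u8>,
    ctx: &RequestContext,
) -> Result<u64, std::io::Error> {
    let uncompressed_len = buf.len();
    let (_buf, res) = writer
        .write_blob_maybe_compressed(buf, ctx, ImageCompressionAlgorithm::Zstd { level: Some(1) })
        .await;
    let (offset, info) = res?;
    if info.written_compressed {
        // compressed_size is Some whenever the blob was large enough to be considered.
        println!(
            "stored compressed: {} -> {:?} bytes",
            uncompressed_len, info.compressed_size
        );
    }
    Ok(offset)
}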
@@ -416,12 +430,14 @@ pub(crate) mod tests {
         let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
         for blob in blobs.iter() {
             let (_, res) = if compression {
-                wtr.write_blob_maybe_compressed(
-                    blob.clone(),
-                    ctx,
-                    ImageCompressionAlgorithm::Zstd { level: Some(1) },
-                )
-                .await
+                let res = wtr
+                    .write_blob_maybe_compressed(
+                        blob.clone(),
+                        ctx,
+                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
+                    )
+                    .await;
+                (res.0, res.1.map(|(off, _)| off))
             } else {
                 wtr.write_blob(blob.clone(), ctx).await
             };
@@ -296,13 +296,19 @@ where
         let mut stack = Vec::new();
         stack.push((self.root_blk, None));
         let block_cursor = self.reader.block_cursor();
+        let mut node_buf = [0_u8; PAGE_SZ];
         while let Some((node_blknum, opt_iter)) = stack.pop() {
-            // Locate the node.
-            let node_buf = block_cursor
+            // Read the node, through the PS PageCache, into local variable `node_buf`.
+            // We could keep the page cache read guard alive, but, at the time of writing,
+            // we run quite small PS PageCaches => can't risk running out of
+            // PageCache space because this stream isn't consumed fast enough.
+            let page_read_guard = block_cursor
                 .read_blk(self.start_blk + node_blknum, ctx)
                 .await?;
-            let node = OnDiskNode::deparse(node_buf.as_ref())?;
+            node_buf.copy_from_slice(page_read_guard.as_ref());
+            drop(page_read_guard); // drop page cache read guard early
+
+            let node = OnDiskNode::deparse(&node_buf)?;
             let prefix_len = node.prefix_len as usize;
             let suffix_len = node.suffix_len as usize;
 
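// Illustrative sketch (not from this commit): the copy-then-drop pattern above in
// isolation — copy the cached page into a stack buffer and release the read guard before
// any long-lived use, so the PageCache slot is not pinned while the stream is suspended.
fn copy_page_out(guarded_page: &[u8]) -> [u8; PAGE_SZ] {
    let mut local = [0_u8; PAGE_SZ];
    local.copy_from_slice(guarded_page); // the guard can be dropped right after this copy
    local
}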
@@ -345,6 +351,7 @@ where
                 Either::Left(idx..node.num_children.into())
             };
 
+
             // idx points to the first match now. Keep going from there
             while let Some(idx) = iter.next() {
                 let key_off = idx * suffix_len;
@@ -51,7 +51,8 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::KeySpaceAccum;
+use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
+use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
@@ -61,7 +62,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;
 
-use super::storage_layer::PersistentLayerDesc;
+use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -871,11 +872,183 @@ impl LayerMap {
         println!("End dump LayerMap");
         Ok(())
     }
 
+    /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
+    /// where we expect to serve reads.
+    ///
+    /// This function is O(N) and should be called infrequently. The caller is responsible for
+    /// looking up and updating the Layer objects for these layer descriptors.
+    pub fn get_visibility(
+        &self,
+        mut read_points: Vec<Lsn>,
+    ) -> (
+        Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
+        KeySpace,
+    ) {
+        // This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
+        // KeySpace is intended to be composed statically and iterated over.
+        struct KeyShadow {
+            // Map of range start to range end
+            inner: RangeSetBlaze<i128>,
+        }
+
+        impl KeyShadow {
+            fn new() -> Self {
+                Self {
+                    inner: Default::default(),
+                }
+            }
+
+            fn contains(&self, range: Range<Key>) -> bool {
+                let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
+                self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
+                    CheckSortedDisjoint::from([range_incl]),
+                ))
+            }
+
+            /// Add the input range to the keys covered by self.
+            ///
+            /// Return true if inserting this range covered some keys that were previously not covered
+            fn cover(&mut self, insert: Range<Key>) -> bool {
+                let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
+                self.inner.ranges_insert(range_incl)
+            }
+
+            fn reset(&mut self) {
+                self.inner = Default::default();
+            }
+
+            fn to_keyspace(&self) -> KeySpace {
+                let mut accum = KeySpaceAccum::new();
+                for range_incl in self.inner.ranges() {
+                    let range = Range {
+                        start: Key::from_i128(*range_incl.start()),
+                        end: Key::from_i128(range_incl.end() + 1),
+                    };
+                    accum.add_range(range)
+                }
+
+                accum.to_keyspace()
+            }
+        }
+
+        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
+        // and a ReadPoint
+        read_points.sort_by_key(|rp| rp.0);
+        let mut shadow = KeyShadow::new();
+
+        // We will interleave all our read points and layers into a sorted collection
+        enum Item {
+            ReadPoint { lsn: Lsn },
+            Layer(Arc<PersistentLayerDesc>),
+        }
+
+        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
+        items.extend(self.iter_historic_layers().map(Item::Layer));
+        items.extend(
+            read_points
+                .into_iter()
+                .map(|rp| Item::ReadPoint { lsn: rp }),
+        );
+
+        // Ordering: we want to iterate like this:
+        // 1. Highest LSNs first
+        // 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
+        // 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
+        items.sort_by_key(|item| {
+            std::cmp::Reverse(match item {
+                Item::Layer(layer) => {
+                    if layer.is_delta() {
+                        (Lsn(layer.get_lsn_range().end.0 - 1), 0)
+                    } else {
+                        (layer.image_layer_lsn(), 1)
+                    }
+                }
+                Item::ReadPoint { lsn } => (*lsn, 2),
+            })
+        });
+
+        let mut results = Vec::with_capacity(self.historic.len());
+
+        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
+
+        for item in items {
+            let (reached_lsn, is_readpoint) = match &item {
+                Item::ReadPoint { lsn } => (lsn, true),
+                Item::Layer(layer) => (&layer.lsn_range.start, false),
+            };
+            maybe_covered_deltas.retain(|d| {
+                if *reached_lsn >= d.lsn_range.start && is_readpoint {
+                    // We encountered a readpoint within the delta layer: it is visible
+
+                    results.push((d.clone(), LayerVisibilityHint::Visible));
+                    false
+                } else if *reached_lsn < d.lsn_range.start {
+                    // We passed the layer's range without encountering a read point: it is not visible
+                    results.push((d.clone(), LayerVisibilityHint::Covered));
+                    false
+                } else {
+                    // We're still in the delta layer: continue iterating
+                    true
+                }
+            });
+
+            match item {
+                Item::ReadPoint { lsn: _lsn } => {
+                    // TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
+                    // to assume that the whole key range is visible at the branch point.
+                    shadow.reset();
+                }
+                Item::Layer(layer) => {
+                    let visibility = if layer.is_delta() {
+                        if shadow.contains(layer.get_key_range()) {
+                            // If a layer isn't visible based on current state, we must defer deciding whether
+                            // it is truly not visible until we have advanced past the delta's range: we might
+                            // encounter another branch point within this delta layer's LSN range.
+                            maybe_covered_deltas.push(layer);
+                            continue;
+                        } else {
+                            LayerVisibilityHint::Visible
+                        }
+                    } else {
+                        let modified = shadow.cover(layer.get_key_range());
+                        if modified {
+                            // An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
+                            LayerVisibilityHint::Visible
+                        } else {
+                            // An image layer in a region that was already covered
+                            LayerVisibilityHint::Covered
+                        }
+                    };
+
+                    results.push((layer, visibility));
+                }
+            }
+        }
+
+        // Drain any remaining maybe_covered deltas
+        results.extend(
+            maybe_covered_deltas
+                .into_iter()
+                .map(|d| (d, LayerVisibilityHint::Covered)),
+        );
+
+        (results, shadow.to_keyspace())
+    }
 }
 
 #[cfg(test)]
 mod tests {
-    use pageserver_api::keyspace::KeySpace;
+    use crate::tenant::{storage_layer::LayerName, IndexPart};
+    use pageserver_api::{
+        key::DBDIR_KEY,
+        keyspace::{KeySpace, KeySpaceRandomAccum},
+    };
+    use std::{collections::HashMap, path::PathBuf};
+    use utils::{
+        id::{TenantId, TimelineId},
+        shard::TenantShardId,
+    };
+
     use super::*;
 
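// Illustrative sketch (assumed usage, not code from this commit): a caller could feed the
// branch tip plus any child branch points into get_visibility and then apply each hint to
// the corresponding Layer object; `apply_hint` stands in for that lookup-and-update step,
// which is not shown in this diff.
fn recompute_visibility(
    layer_map: &LayerMap,
    tip_lsn: Lsn,
    mut branch_points: Vec<Lsn>,
    apply_hint: impl Fn(&PersistentLayerDesc, LayerVisibilityHint),
) -> KeySpace {
    branch_points.push(tip_lsn);
    let (visibilities, shadow) = layer_map.get_visibility(branch_points);
    for (desc, visibility) in visibilities {
        apply_hint(desc.as_ref(), visibility);
    }
    // The returned shadow is the keyspace still readable below the lowest read point.
    shadow
}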
@@ -1002,4 +1175,299 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn layer_visibility_basic() {
+        // A simple synthetic input, as a smoke test.
+        let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+        let timeline_id = TimelineId::generate();
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+
+        const FAKE_LAYER_SIZE: u64 = 1024;
+
+        let inject_delta = |updates: &mut BatchedUpdates,
+                            key_start: i128,
+                            key_end: i128,
+                            lsn_start: u64,
+                            lsn_end: u64| {
+            let desc = PersistentLayerDesc::new_delta(
+                tenant_shard_id,
+                timeline_id,
+                Range {
+                    start: Key::from_i128(key_start),
+                    end: Key::from_i128(key_end),
+                },
+                Range {
+                    start: Lsn(lsn_start),
+                    end: Lsn(lsn_end),
+                },
+                1024,
+            );
+            updates.insert_historic(desc.clone());
+            desc
+        };
+
+        let inject_image =
+            |updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
+                let desc = PersistentLayerDesc::new_img(
+                    tenant_shard_id,
+                    timeline_id,
+                    Range {
+                        start: Key::from_i128(key_start),
+                        end: Key::from_i128(key_end),
+                    },
+                    Lsn(lsn),
+                    FAKE_LAYER_SIZE,
+                );
+                updates.insert_historic(desc.clone());
+                desc
+            };
+
+        //
+        // Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
+        // we expect to handle. You can follow these examples through in the same order as they would be processed
+        // by the function under test.
+        //
+
+        let mut read_points = vec![Lsn(1000)];
+
+        // A delta ahead of any image layer
+        let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
+
+        // An image layer is visible and covers some layers beneath itself
+        let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
+
+        // A delta layer covered by the image layer: should be covered
+        let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
+
+        // A delta layer partially covered by an image layer: should be visible
+        let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
+
+        // A delta layer not covered by an image layer: should be visible
+        let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
+
+        // An image layer covered by the image layer above: should be covered
+        let covered_image = inject_image(&mut updates, 10, 20, 89);
+
+        // An image layer partially covered by an image layer: should be visible
+        let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
+
+        // An image layer not covered by an image layer: should be visible
+        let not_covered_image = inject_image(&mut updates, 1, 4, 89);
+
+        // A read point: this will make subsequent layers below here visible, even if there are
+        // more recent layers covering them.
+        read_points.push(Lsn(80));
+
+        // A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
+        let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
+
+        // A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
+        // the read point should make it visible, even though its end LSN is covered
+        let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
+        let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
+        read_points.push(Lsn(65));
+        let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
+
+        let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
+
+        updates.flush();
+
+        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
+        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
+
+        assert_eq!(
+            layer_visibilities.get(&ahead_layer),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&visible_covering_img),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&partially_covered_delta),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&not_covered_delta),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_image),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&partially_covered_image),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&not_covered_image),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_below_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covering_img_between_read_points),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_between_read_points),
+            Some(&LayerVisibilityHint::Covered)
+        );
+        assert_eq!(
+            layer_visibilities.get(&covered_delta_intersects_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+        assert_eq!(
+            layer_visibilities.get(&visible_img_after_last_read_point),
+            Some(&LayerVisibilityHint::Visible)
+        );
+
+        // Shadow should include all the images below the last read point
+        let expected_shadow = KeySpace {
+            ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
+        };
+        assert_eq!(shadow, expected_shadow);
+    }
+
+    fn fixture_path(relative: &str) -> PathBuf {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
+    }
+
+    #[test]
+    fn layer_visibility_realistic() {
+        // Load a large example layermap
+        let index_raw = std::fs::read_to_string(fixture_path(
+            "test_data/indices/mixed_workload/index_part.json",
+        ))
+        .unwrap();
+        let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
+
+        let tenant_id = TenantId::generate();
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+        let timeline_id = TimelineId::generate();
+
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+        for (layer_name, layer_metadata) in index.layer_metadata {
+            let layer_desc = match layer_name {
+                LayerName::Image(layer_name) => PersistentLayerDesc {
+                    key_range: layer_name.key_range.clone(),
+                    lsn_range: layer_name.lsn_as_range(),
+                    tenant_shard_id,
+                    timeline_id,
+                    is_delta: false,
+                    file_size: layer_metadata.file_size,
+                },
+                LayerName::Delta(layer_name) => PersistentLayerDesc {
+                    key_range: layer_name.key_range,
+                    lsn_range: layer_name.lsn_range,
+                    tenant_shard_id,
+                    timeline_id,
+                    is_delta: true,
+                    file_size: layer_metadata.file_size,
+                },
+            };
+            updates.insert_historic(layer_desc);
+        }
+        updates.flush();
+
+        let read_points = vec![index.metadata.disk_consistent_lsn()];
+        let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
+        for (layer_desc, visibility) in &layer_visibilities {
+            tracing::info!("{layer_desc:?}: {visibility:?}");
+            eprintln!("{layer_desc:?}: {visibility:?}");
+        }
+
+        // The shadow should be non-empty, since there were some image layers
+        assert!(!shadow.ranges.is_empty());
+
+        // At least some layers should be marked covered
+        assert!(layer_visibilities
+            .iter()
+            .any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
+
+        let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
+
+        // Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
+        for (layer_desc, visible) in &layer_visibilities {
+            let mut coverage = KeySpaceRandomAccum::new();
+            let mut covered_by = Vec::new();
+
+            for other_layer in layer_map.iter_historic_layers() {
+                if &other_layer == layer_desc {
+                    continue;
+                }
+                if !other_layer.is_delta()
+                    && other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
+                    && other_layer.key_range.start <= layer_desc.key_range.end
+                    && layer_desc.key_range.start <= other_layer.key_range.end
+                {
+                    coverage.add_range(other_layer.get_key_range());
+                    covered_by.push((*other_layer).clone());
+                }
+            }
+            let coverage = coverage.to_keyspace();
+
+            let expect_visible = if coverage.ranges.len() == 1
+                && coverage.contains(&layer_desc.key_range.start)
+                && coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
+            {
+                LayerVisibilityHint::Covered
+            } else {
+                LayerVisibilityHint::Visible
+            };
+
+            if expect_visible != *visible {
+                eprintln!(
+                    "Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
+                    layer_desc.key_range.start,
+                    layer_desc.key_range.end,
+                    layer_desc.lsn_range.start,
+                    layer_desc.lsn_range.end,
+                    layer_desc.is_delta()
+                );
+                if expect_visible == LayerVisibilityHint::Covered {
+                    eprintln!("Covered by:");
+                    for other in covered_by {
+                        eprintln!(
+                            "  {}..{} @ {}",
+                            other.get_key_range().start,
+                            other.get_key_range().end,
+                            other.image_layer_lsn()
+                        );
+                    }
+                    if let Some(range) = coverage.ranges.first() {
+                        eprintln!(
+                            "Total coverage from contributing layers: {}..{}",
+                            range.start, range.end
+                        );
+                    } else {
+                        eprintln!(
+                            "Total coverage from contributing layers: {:?}",
+                            coverage.ranges
+                        );
+                    }
+                }
+            }
+            assert_eq!(expect_visible, *visible);
+        }
+
+        // Sanity: the layer that holds latest data for the DBDIR key should always be visible
+        // (just using this key as a key that will always exist for any layermap fixture)
+        let dbdir_layer = layer_map
+            .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
+            .unwrap();
+        assert!(matches!(
+            layer_visibilities.get(&dbdir_layer.layer).unwrap(),
+            LayerVisibilityHint::Visible
+        ));
+    }
 }
@@ -521,6 +521,10 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
 
         Ok(&self.historic_coverage)
     }
+
+    pub(crate) fn len(&self) -> usize {
+        self.layers.len()
+    }
 }
 
 #[test]
@@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId};
 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
 use super::timeline::detach_ancestor::PreparedTimelineDetach;
-use super::TenantSharedResources;
+use super::{GlobalShutDown, TenantSharedResources};
 
 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
@@ -116,8 +116,6 @@ pub(crate) enum ShardSelector {
     /// Only return the 0th shard, if it is present. If a non-0th shard is present,
     /// ignore it.
     Zero,
-    /// Pick the first shard we find for the TenantId
-    First,
     /// Pick the shard that holds this key
     Page(Key),
     /// The shard ID is known: pick the given shard
@@ -667,17 +665,20 @@ pub async fn init_tenant_mgr(
         let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
         let shard_identity = location_conf.shard;
         let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
-                conf,
-                tenant_shard_id,
-                &tenant_dir_path,
-                resources.clone(),
-                AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                shard_identity,
-                Some(init_order.clone()),
-                SpawnMode::Lazy,
-                &ctx,
-            )),
+            LocationMode::Attached(attached_conf) => TenantSlot::Attached(
+                tenant_spawn(
+                    conf,
+                    tenant_shard_id,
+                    &tenant_dir_path,
+                    resources.clone(),
+                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                    shard_identity,
+                    Some(init_order.clone()),
+                    SpawnMode::Lazy,
+                    &ctx,
+                )
+                .expect("global shutdown during init_tenant_mgr cannot happen"),
+            ),
             LocationMode::Secondary(secondary_conf) => {
                 info!(
                     tenant_id = %tenant_shard_id.tenant_id,
@@ -725,7 +726,7 @@ fn tenant_spawn(
     init_order: Option<InitializationOrder>,
     mode: SpawnMode,
     ctx: &RequestContext,
-) -> Arc<Tenant> {
+) -> Result<Arc<Tenant>, GlobalShutDown> {
     // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
     // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
     // to avoid impacting prod runtime performance.
@@ -1192,7 +1193,10 @@ impl TenantManager {
                     None,
                     spawn_mode,
                     ctx,
-                );
+                )
+                .map_err(|_: GlobalShutDown| {
+                    UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
+                })?;
 
                 TenantSlot::Attached(tenant)
             }
@@ -1313,7 +1317,7 @@ impl TenantManager {
             None,
             SpawnMode::Eager,
             ctx,
-        );
+        )?;
 
         slot_guard.upsert(TenantSlot::Attached(tenant))?;
 
@@ -1384,34 +1388,32 @@ impl TenantManager {
         tenant_shard_id: TenantShardId,
     ) -> Result<(), DeleteTenantError> {
         let remote_path = remote_tenant_path(&tenant_shard_id);
-        let keys = match self
-            .resources
-            .remote_storage
-            .list(
-                Some(&remote_path),
-                remote_storage::ListingMode::NoDelimiter,
-                None,
-                &self.cancel,
-            )
-            .await
-        {
-            Ok(listing) => listing.keys,
-            Err(remote_storage::DownloadError::Cancelled) => {
-                return Err(DeleteTenantError::Cancelled)
-            }
-            Err(remote_storage::DownloadError::NotFound) => return Ok(()),
-            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-        };
+        let mut keys_stream = self.resources.remote_storage.list_streaming(
+            Some(&remote_path),
+            remote_storage::ListingMode::NoDelimiter,
+            None,
+            &self.cancel,
+        );
+        while let Some(chunk) = keys_stream.next().await {
+            let keys = match chunk {
+                Ok(listing) => listing.keys,
+                Err(remote_storage::DownloadError::Cancelled) => {
+                    return Err(DeleteTenantError::Cancelled)
+                }
+                Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+                Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
+            };
 
-        if keys.is_empty() {
-            tracing::info!("Remote storage already deleted");
-        } else {
-            tracing::info!("Deleting {} keys from remote storage", keys.len());
-            let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
-            self.resources
-                .remote_storage
-                .delete_objects(&keys, &self.cancel)
-                .await?;
+            if keys.is_empty() {
+                tracing::info!("Remote storage already deleted");
+            } else {
+                tracing::info!("Deleting {} keys from remote storage", keys.len());
+                let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
+                self.resources
+                    .remote_storage
+                    .delete_objects(&keys, &self.cancel)
+                    .await?;
+            }
         }
 
         Ok(())
@@ -2049,7 +2051,7 @@ impl TenantManager {
             None,
             SpawnMode::Eager,
             ctx,
-        );
+        )?;
 
         slot_guard.upsert(TenantSlot::Attached(tenant))?;
 
@@ -2090,7 +2092,6 @@ impl TenantManager {
         };
 
         match selector {
-            ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
             ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                 return ShardResolveResult::Found(tenant.clone())
             }
@@ -2172,6 +2173,9 @@ pub(crate) enum GetActiveTenantError {
     /// never happen.
     #[error("Tenant is broken: {0}")]
     Broken(String),
+
+    #[error("reconnect to switch tenant id")]
+    SwitchedTenant,
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -1378,6 +1378,18 @@ impl RemoteTimelineClient {
             .dirty
             .layer_metadata
             .drain()
+            .filter(|(_file_name, meta)| {
+                // Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
+                // all shards anyway, we _could_ delete these, but
+                // - it creates a potential race if other shards are still
+                //   using the layers while this shard deletes them.
+                // - it means that if we rolled back the shard split, the ancestor shards would be in a state where
+                //   these timelines are present but corrupt (their index exists but some layers don't)
+                //
+                // These layers will eventually be cleaned up by the scrubber when it does physical GC.
+                meta.shard.shard_number == self.tenant_shard_id.shard_number
+                    && meta.shard.shard_count == self.tenant_shard_id.shard_count
+            })
             .map(|(file_name, meta)| {
                 remote_layer_path(
                     &self.tenant_shard_id.tenant_id,
@@ -8,6 +8,9 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;
 
+#[cfg(test)]
+pub mod split_writer;
+
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::walrecord::NeonWalRecord;
@@ -451,20 +454,14 @@ pub enum ValueReconstructResult {
 /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
 /// of layers (for example when creating a branch that makes some previously covered layers visible). It should
 /// be used for cache management but not for correctness-critical checks.
-#[derive(Default, Debug, Clone, PartialEq, Eq)]
-pub(crate) enum LayerVisibilityHint {
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum LayerVisibilityHint {
     /// A Visible layer might be read while serving a read, because there is not an image layer between it
     /// and a readable LSN (the tip of the branch or a child's branch point)
     Visible,
     /// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
     /// a branch or ephemeral endpoint at an LSN below the layer that covers this.
-    #[allow(unused)]
     Covered,
-    /// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
-    /// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
-    /// state is for when existing layers are constructed while loading a timeline.
-    #[default]
-    Uninitialized,
 }
 
 pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
@@ -626,23 +623,30 @@ impl LayerAccessStats {
         }
     }
 
-    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
-        let value = match visibility {
-            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
-            LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
-        };
-
-        self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
-    }
-
-    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
-        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
-        match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
+    /// Helper for extracting the visibility hint from the literal value of our inner u64
+    fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
+        match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
             1 => LayerVisibilityHint::Visible,
             0 => LayerVisibilityHint::Covered,
             _ => unreachable!(),
         }
     }
 
+    /// Returns the old value which has been replaced
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
+        let value = match visibility {
+            LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
+            LayerVisibilityHint::Covered => 0x0,
+        };
+
+        let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
+        self.decode_visibility(old_bits)
+    }
+
+    pub(crate) fn visibility(&self) -> LayerVisibilityHint {
+        let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
+        self.decode_visibility(read)
+    }
 }
 
 /// Get a layer descriptor from a layer.
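// Illustrative sketch (an assumption, not code from this commit): how the single visibility
// bit behaves for a freshly zeroed LayerAccessStats, following decode_visibility above.
// Constructing the tuple struct directly is only for demonstration.
fn visibility_bit_example() {
    let stats = LayerAccessStats(std::sync::atomic::AtomicU64::new(0));
    // A zeroed word decodes as Covered (bit 0 at VISIBILITY_SHIFT).
    assert_eq!(stats.visibility(), LayerVisibilityHint::Covered);
    // set_visibility returns the hint that was previously stored.
    let previous = stats.set_visibility(LayerVisibilityHint::Visible);
    assert_eq!(previous, LayerVisibilityHint::Covered);
    assert_eq!(stats.visibility(), LayerVisibilityHint::Visible);
}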
@@ -467,7 +467,7 @@ impl DeltaLayerWriterInner {
             .write_blob_maybe_compressed(val, ctx, compression)
             .await;
         let off = match res {
-            Ok(off) => off,
+            Ok((off, _)) => off,
             Err(e) => return (val, Err(anyhow::anyhow!(e))),
         };
 
@@ -734,8 +734,22 @@ struct ImageLayerWriterInner {
     // Total uncompressed bytes passed into put_image
     uncompressed_bytes: u64,
 
+    // Like `uncompressed_bytes`,
+    // but only of images we might consider for compression
+    uncompressed_bytes_eligible: u64,
+
+    // Like `uncompressed_bytes`, but only of images
+    // where we have chosen their compressed form
+    uncompressed_bytes_chosen: u64,
+
+    // Number of keys in the layer.
+    num_keys: usize,
+
     blob_writer: BlobWriter<false>,
     tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 
+    #[cfg_attr(not(feature = "testing"), allow(dead_code))]
+    last_written_key: Key,
 }
 
 impl ImageLayerWriterInner {
@@ -790,6 +804,10 @@ impl ImageLayerWriterInner {
|
|||||||
tree: tree_builder,
|
tree: tree_builder,
|
||||||
blob_writer,
|
blob_writer,
|
||||||
uncompressed_bytes: 0,
|
uncompressed_bytes: 0,
|
||||||
|
uncompressed_bytes_eligible: 0,
|
||||||
|
uncompressed_bytes_chosen: 0,
|
||||||
|
num_keys: 0,
|
||||||
|
last_written_key: Key::MIN,
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(writer)
|
Ok(writer)
|
||||||
@@ -808,18 +826,33 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
        let compression = self.conf.image_compression;
-       self.uncompressed_bytes += img.len() as u64;
+       let uncompressed_len = img.len() as u64;
+       self.uncompressed_bytes += uncompressed_len;
+       self.num_keys += 1;
        let (_img, res) = self
            .blob_writer
            .write_blob_maybe_compressed(img, ctx, compression)
            .await;
        // TODO: re-use the buffer for `img` further upstack
-       let off = res?;
+       let (off, compression_info) = res?;
+       if compression_info.compressed_size.is_some() {
+           // The image has been considered for compression at least
+           self.uncompressed_bytes_eligible += uncompressed_len;
+       }
+       if compression_info.written_compressed {
+           // The image has been compressed
+           self.uncompressed_bytes_chosen += uncompressed_len;
+       }

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
        self.tree.append(&keybuf, off)?;

+       #[cfg(feature = "testing")]
+       {
+           self.last_written_key = key;
+       }

        Ok(())
    }

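Together, the three per-writer counters distinguish "could have been compressed" from "compression actually won", which is what the metrics emitted at finish() time report. A hedged sketch of how such counters could be turned into ratios; the struct and helper names below are illustrative, not part of the writer's API:

/// Summary of image-compression accounting, mirroring the counters kept by the writer.
struct CompressionAccounting {
    uncompressed_bytes: u64,          // all bytes passed to put_image
    uncompressed_bytes_eligible: u64, // bytes that were at least considered for compression
    uncompressed_bytes_chosen: u64,   // bytes for which the compressed form was kept
    compressed_size: u64,             // bytes actually written out
}

impl CompressionAccounting {
    /// Fraction of input that was eligible to try compressing.
    fn eligible_ratio(&self) -> f64 {
        ratio(self.uncompressed_bytes_eligible, self.uncompressed_bytes)
    }

    /// Fraction of input for which the compressed form actually won.
    fn chosen_ratio(&self) -> f64 {
        ratio(self.uncompressed_bytes_chosen, self.uncompressed_bytes)
    }

    /// Overall output/input ratio; below 1.0 means the layer shrank.
    fn output_ratio(&self) -> f64 {
        ratio(self.compressed_size, self.uncompressed_bytes)
    }
}

fn ratio(num: u64, den: u64) -> f64 {
    if den == 0 {
        0.0
    } else {
        num as f64 / den as f64
    }
}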
@@ -830,6 +863,7 @@ impl ImageLayerWriterInner {
        self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
+       end_key: Option<Key>,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -837,6 +871,9 @@ impl ImageLayerWriterInner {
        // Calculate compression ratio
        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
+       crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
+           .inc_by(self.uncompressed_bytes_eligible);
+       crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);

        let mut file = self.blob_writer.into_inner();
@@ -877,11 +914,23 @@ impl ImageLayerWriterInner {
        let desc = PersistentLayerDesc::new_img(
            self.tenant_shard_id,
            self.timeline_id,
-           self.key_range.clone(),
+           if let Some(end_key) = end_key {
+               self.key_range.start..end_key
+           } else {
+               self.key_range.clone()
+           },
            self.lsn,
            metadata.len(),
        );

+       #[cfg(feature = "testing")]
+       if let Some(end_key) = end_key {
+           assert!(
+               self.last_written_key < end_key,
+               "written key violates end_key range"
+           );
+       }

        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -958,6 +1007,18 @@ impl ImageLayerWriter {
        self.inner.as_mut().unwrap().put_image(key, img, ctx).await
    }

+   #[cfg(test)]
+   /// Estimated size of the image layer.
+   pub(crate) fn estimated_size(&self) -> u64 {
+       let inner = self.inner.as_ref().unwrap();
+       inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
+   }
+
+   #[cfg(test)]
+   pub(crate) fn num_keys(&self) -> usize {
+       self.inner.as_ref().unwrap().num_keys
+   }

    ///
    /// Finish writing the image layer.
    ///
@@ -966,7 +1027,22 @@ impl ImageLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<super::ResidentLayer> {
-       self.inner.take().unwrap().finish(timeline, ctx).await
+       self.inner.take().unwrap().finish(timeline, ctx, None).await
+   }
+
+   #[cfg(test)]
+   /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
+   pub(super) async fn finish_with_end_key(
+       mut self,
+       timeline: &Arc<Timeline>,
+       end_key: Key,
+       ctx: &RequestContext,
+   ) -> anyhow::Result<super::ResidentLayer> {
+       self.inner
+           .take()
+           .unwrap()
+           .finish(timeline, ctx, Some(end_key))
+           .await
    }
}

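finish() with end_key = None keeps the writer's original key range, while Some(end_key) truncates it to an exclusive upper bound; the testing-only assertion above checks that the last written key stays strictly below it. A tiny generic sketch of that convention, with illustrative names rather than the writer's real types:

use std::ops::Range;

/// end_key is exclusive: Some(end) truncates the range, None keeps it as-is.
fn effective_range<K: Ord>(full: Range<K>, end_key: Option<K>) -> Range<K> {
    match end_key {
        Some(end) => {
            debug_assert!(end <= full.end, "end_key must not extend the range");
            full.start..end
        }
        None => full,
    }
}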
@@ -24,7 +24,8 @@ use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-   PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
+   LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
+   ValuesReconstructState,
};

use utils::generation::Generation;
@@ -246,7 +247,7 @@ impl Layer {
            &timeline.generation,
        );

-       let layer = LayerInner::new(
+       LayerInner::new(
            conf,
            timeline,
            local_path,
@@ -254,14 +255,7 @@ impl Layer {
            Some(inner),
            timeline.generation,
            timeline.get_shard_index(),
-       );
+       )
-
-       // Newly created layers are marked visible by default: the usual case is that they were created to be read.
-       layer
-           .access_stats
-           .set_visibility(super::LayerVisibilityHint::Visible);
-
-       layer
        }));

        let downloaded = resident.expect("just initialized");
@@ -493,6 +487,32 @@ impl Layer {
            }
        }
    }

+   pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
+       let old_visibility = self.access_stats().set_visibility(visibility.clone());
+       use LayerVisibilityHint::*;
+       match (old_visibility, visibility) {
+           (Visible, Covered) => {
+               // Subtract this layer's contribution to the visible size metric
+               if let Some(tl) = self.0.timeline.upgrade() {
+                   tl.metrics
+                       .visible_physical_size_gauge
+                       .sub(self.0.desc.file_size)
+               }
+           }
+           (Covered, Visible) => {
+               // Add this layer's contribution to the visible size metric
+               if let Some(tl) = self.0.timeline.upgrade() {
+                   tl.metrics
+                       .visible_physical_size_gauge
+                       .add(self.0.desc.file_size)
+               }
+           }
+           (Covered, Covered) | (Visible, Visible) => {
+               // no change
+           }
+       }
+   }
}

/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
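Because the stats-level set_visibility returns the hint it replaced, the gauge above only moves on genuine Visible/Covered transitions and repeated calls with the same hint are no-ops. The same idea in isolation, with a plain integer standing in for the metrics gauge (names here are illustrative):

/// Only genuine transitions move the gauge; repeated calls with the same state are no-ops.
fn apply_visibility_transition(gauge: &mut i64, was_visible: bool, now_visible: bool, file_size: u64) {
    match (was_visible, now_visible) {
        (true, false) => *gauge -= file_size as i64, // became covered
        (false, true) => *gauge += file_size as i64, // became visible
        _ => {}                                      // no transition, no change
    }
}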
@@ -693,6 +713,13 @@ impl Drop for LayerInner {
                timeline.metrics.layer_count_image.dec();
                timeline.metrics.layer_size_image.sub(self.desc.file_size);
            }

+           if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
+               timeline
+                   .metrics
+                   .visible_physical_size_gauge
+                   .sub(self.desc.file_size);
+           }
        }

        if !*self.wanted_deleted.get_mut() {
@@ -801,6 +828,12 @@ impl LayerInner {
            timeline.metrics.layer_size_image.add(desc.file_size);
        }

+       // New layers are visible by default. This metric is later updated on drop or in set_visibility
+       timeline
+           .metrics
+           .visible_physical_size_gauge
+           .add(desc.file_size);
+
        LayerInner {
            conf,
            debug_str: {
@@ -41,6 +41,20 @@ pub struct PersistentLayerKey {
    pub is_delta: bool,
}

+impl std::fmt::Display for PersistentLayerKey {
+   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+       write!(
+           f,
+           "{}..{} {}..{} is_delta={}",
+           self.key_range.start,
+           self.key_range.end,
+           self.lsn_range.start,
+           self.lsn_range.end,
+           self.is_delta
+       )
+   }
+}
+
impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
pageserver/src/tenant/storage_layer/split_writer.rs (new file, 244 lines)
@@ -0,0 +1,244 @@
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use bytes::Bytes;
|
||||||
|
use pageserver_api::key::{Key, KEY_SIZE};
|
||||||
|
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||||
|
|
||||||
|
use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
|
||||||
|
|
||||||
|
use super::{ImageLayerWriter, ResidentLayer};
|
||||||
|
|
||||||
|
/// An image writer that takes images and produces multiple image layers. The interface does not
|
||||||
|
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
|
||||||
|
/// to be cleaned up)
|
||||||
|
#[must_use]
|
||||||
|
pub struct SplitImageLayerWriter {
|
||||||
|
inner: ImageLayerWriter,
|
||||||
|
target_layer_size: u64,
|
||||||
|
generated_layers: Vec<ResidentLayer>,
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
lsn: Lsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SplitImageLayerWriter {
|
||||||
|
pub async fn new(
|
||||||
|
conf: &'static PageServerConf,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
start_key: Key,
|
||||||
|
lsn: Lsn,
|
||||||
|
target_layer_size: u64,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> anyhow::Result<Self> {
|
||||||
|
Ok(Self {
|
||||||
|
target_layer_size,
|
||||||
|
inner: ImageLayerWriter::new(
|
||||||
|
conf,
|
||||||
|
timeline_id,
|
||||||
|
tenant_shard_id,
|
||||||
|
&(start_key..Key::MAX),
|
||||||
|
lsn,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?,
|
||||||
|
generated_layers: Vec::new(),
|
||||||
|
conf,
|
||||||
|
timeline_id,
|
||||||
|
tenant_shard_id,
|
||||||
|
lsn,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn put_image(
|
||||||
|
&mut self,
|
||||||
|
key: Key,
|
||||||
|
img: Bytes,
|
||||||
|
tline: &Arc<Timeline>,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
// The current estimation is an upper bound of the space that the key/image could take
|
||||||
|
// because we did not consider compression in this estimation. The resulting image layer
|
||||||
|
// could be smaller than the target size.
|
||||||
|
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
|
||||||
|
if self.inner.num_keys() >= 1
|
||||||
|
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||||
|
{
|
||||||
|
let next_image_writer = ImageLayerWriter::new(
|
||||||
|
self.conf,
|
||||||
|
self.timeline_id,
|
||||||
|
self.tenant_shard_id,
|
||||||
|
&(key..Key::MAX),
|
||||||
|
self.lsn,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
|
||||||
|
self.generated_layers.push(
|
||||||
|
prev_image_writer
|
||||||
|
.finish_with_end_key(tline, key, ctx)
|
||||||
|
.await?,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
self.inner.put_image(key, img, ctx).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn finish(
|
||||||
|
self,
|
||||||
|
tline: &Arc<Timeline>,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
end_key: Key,
|
||||||
|
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||||
|
let Self {
|
||||||
|
mut generated_layers,
|
||||||
|
inner,
|
||||||
|
..
|
||||||
|
} = self;
|
||||||
|
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
|
||||||
|
Ok(generated_layers)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use crate::{
|
||||||
|
tenant::{
|
||||||
|
harness::{TenantHarness, TIMELINE_ID},
|
||||||
|
storage_layer::AsLayerDesc,
|
||||||
|
},
|
||||||
|
DEFAULT_PG_VERSION,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
fn get_key(id: u32) -> Key {
|
||||||
|
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||||
|
key.field6 = id;
|
||||||
|
key
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_img(id: u32) -> Bytes {
|
||||||
|
format!("{id:064}").into()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_large_img() -> Bytes {
|
||||||
|
vec![0; 8192].into()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn write_one_image() {
|
||||||
|
let harness = TenantHarness::create("split_writer_write_one_image")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
|
let tline = tenant
|
||||||
|
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut writer = SplitImageLayerWriter::new(
|
||||||
|
tenant.conf,
|
||||||
|
tline.timeline_id,
|
||||||
|
tenant.tenant_shard_id,
|
||||||
|
get_key(0),
|
||||||
|
Lsn(0x18),
|
||||||
|
4 * 1024 * 1024,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
writer
|
||||||
|
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
|
||||||
|
assert_eq!(layers.len(), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn write_split() {
|
||||||
|
let harness = TenantHarness::create("split_writer_write_split")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
|
let tline = tenant
|
||||||
|
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut writer = SplitImageLayerWriter::new(
|
||||||
|
tenant.conf,
|
||||||
|
tline.timeline_id,
|
||||||
|
tenant.tenant_shard_id,
|
||||||
|
get_key(0),
|
||||||
|
Lsn(0x18),
|
||||||
|
4 * 1024 * 1024,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
const N: usize = 2000;
|
||||||
|
for i in 0..N {
|
||||||
|
let i = i as u32;
|
||||||
|
writer
|
||||||
|
.put_image(get_key(i), get_large_img(), &tline, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
let layers = writer
|
||||||
|
.finish(&tline, &ctx, get_key(N as u32))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(layers.len(), N / 512 + 1);
|
||||||
|
for idx in 0..layers.len() {
|
||||||
|
assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||||
|
assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||||
|
if idx > 0 {
|
||||||
|
assert_eq!(
|
||||||
|
layers[idx - 1].layer_desc().key_range.end,
|
||||||
|
layers[idx].layer_desc().key_range.start
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn write_large_img() {
|
||||||
|
let harness = TenantHarness::create("split_writer_write_large_img")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
|
let tline = tenant
|
||||||
|
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut writer = SplitImageLayerWriter::new(
|
||||||
|
tenant.conf,
|
||||||
|
tline.timeline_id,
|
||||||
|
tenant.tenant_shard_id,
|
||||||
|
get_key(0),
|
||||||
|
Lsn(0x18),
|
||||||
|
4 * 1024,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
writer
|
||||||
|
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
writer
|
||||||
|
.put_image(get_key(1), get_large_img(), &tline, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
|
||||||
|
assert_eq!(layers.len(), 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
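The split writer's put_image rolls over to a fresh ImageLayerWriter before an image would push the current layer past the target size; the estimate is an upper bound since compression is ignored, so produced layers may end up smaller than the target. The roll-over condition in isolation, as a hedged sketch with generic parameter names:

/// Returns true when the pending image should start a new layer instead of being
/// appended to the current one. Mirrors the check in put_image above: never split
/// before at least one key has been written, and split once the upper-bound size
/// estimate would reach the target.
fn should_split(
    num_keys_in_current: usize,
    current_size_estimate: u64,
    key_size: u64,
    img_len: u64,
    target_layer_size: u64,
) -> bool {
    let addition_size_estimation = key_size + img_len;
    num_keys_in_current >= 1
        && current_size_estimate + addition_size_estimation >= target_layer_size
}

A caller would evaluate this with the writer's estimated_size() and num_keys() before each put, finishing the current layer with the incoming key as its exclusive end key whenever it returns true.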
@@ -3,6 +3,7 @@ pub(crate) mod compaction;
|
|||||||
pub mod delete;
|
pub mod delete;
|
||||||
pub(crate) mod detach_ancestor;
|
pub(crate) mod detach_ancestor;
|
||||||
mod eviction_task;
|
mod eviction_task;
|
||||||
|
pub(crate) mod handle;
|
||||||
mod init;
|
mod init;
|
||||||
pub mod layer_manager;
|
pub mod layer_manager;
|
||||||
pub(crate) mod logical_size;
|
pub(crate) mod logical_size;
|
||||||
@@ -17,6 +18,7 @@ use camino::Utf8Path;
|
|||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use enumset::EnumSet;
|
use enumset::EnumSet;
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
|
use handle::ShardTimelineId;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
key::{
|
key::{
|
||||||
@@ -58,7 +60,7 @@ use std::{
|
|||||||
sync::atomic::AtomicU64,
|
sync::atomic::AtomicU64,
|
||||||
};
|
};
|
||||||
use std::{
|
use std::{
|
||||||
cmp::{max, min, Ordering},
|
cmp::{max, min},
|
||||||
ops::ControlFlow,
|
ops::ControlFlow,
|
||||||
};
|
};
|
||||||
use std::{
|
use std::{
|
||||||
@@ -74,6 +76,7 @@ use crate::{
|
|||||||
metadata::TimelineMetadata,
|
metadata::TimelineMetadata,
|
||||||
storage_layer::PersistentLayerDesc,
|
storage_layer::PersistentLayerDesc,
|
||||||
},
|
},
|
||||||
|
walredo,
|
||||||
};
|
};
|
||||||
use crate::{
|
use crate::{
|
||||||
context::{DownloadBehavior, RequestContext},
|
context::{DownloadBehavior, RequestContext},
|
||||||
@@ -140,7 +143,10 @@ use self::walreceiver::{WalReceiver, WalReceiverConf};
|
|||||||
use super::{config::TenantConf, upload_queue::NotInitialized};
|
use super::{config::TenantConf, upload_queue::NotInitialized};
|
||||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
use super::{
|
||||||
|
remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
|
||||||
|
storage_layer::ReadableLayer,
|
||||||
|
};
|
||||||
use super::{
|
use super::{
|
||||||
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
|
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
|
||||||
GcError,
|
GcError,
|
||||||
@@ -177,25 +183,6 @@ impl std::fmt::Display for ImageLayerCreationMode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
||||||
pub(crate) struct Hole {
|
|
||||||
key_range: Range<Key>,
|
|
||||||
coverage_size: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Ord for Hole {
|
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
|
||||||
other.coverage_size.cmp(&self.coverage_size) // inverse order
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartialOrd for Hole {
|
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
|
||||||
Some(self.cmp(other))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||||
/// Can be removed after all refactors are done.
|
/// Can be removed after all refactors are done.
|
||||||
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
||||||
@@ -443,6 +430,8 @@ pub struct Timeline {
|
|||||||
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
|
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
|
||||||
|
|
||||||
pub(crate) l0_flush_global_state: L0FlushGlobalState,
|
pub(crate) l0_flush_global_state: L0FlushGlobalState,
|
||||||
|
|
||||||
|
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct WalReceiverInfo {
|
pub struct WalReceiverInfo {
|
||||||
@@ -548,7 +537,6 @@ impl GetVectoredError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct MissingKeyError {
|
pub struct MissingKeyError {
|
||||||
key: Key,
|
key: Key,
|
||||||
shard: ShardNumber,
|
shard: ShardNumber,
|
||||||
@@ -559,6 +547,12 @@ pub struct MissingKeyError {
|
|||||||
backtrace: Option<std::backtrace::Backtrace>,
|
backtrace: Option<std::backtrace::Backtrace>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for MissingKeyError {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "{}", self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for MissingKeyError {
|
impl std::fmt::Display for MissingKeyError {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
write!(
|
write!(
|
||||||
@@ -1010,7 +1004,10 @@ impl Timeline {
|
|||||||
.for_get_kind(GetKind::Singular)
|
.for_get_kind(GetKind::Singular)
|
||||||
.observe(elapsed.as_secs_f64());
|
.observe(elapsed.as_secs_f64());
|
||||||
|
|
||||||
if cfg!(feature = "testing") && res.is_err() {
|
if cfg!(feature = "testing")
|
||||||
|
&& res.is_err()
|
||||||
|
&& !matches!(res, Err(PageReconstructError::Cancelled))
|
||||||
|
{
|
||||||
// it can only be walredo issue
|
// it can only be walredo issue
|
||||||
use std::fmt::Write;
|
use std::fmt::Write;
|
||||||
|
|
||||||
@@ -1929,6 +1926,9 @@ impl Timeline {
|
|||||||
tracing::debug!("Cancelling CancellationToken");
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
self.cancel.cancel();
|
self.cancel.cancel();
|
||||||
|
|
||||||
|
// Prevent new page service requests from starting.
|
||||||
|
self.handles.shutdown();
|
||||||
|
|
||||||
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
||||||
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
|
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
|
||||||
self.remote_client.stop();
|
self.remote_client.stop();
|
||||||
@@ -2454,6 +2454,8 @@ impl Timeline {
|
|||||||
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
|
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
|
||||||
|
|
||||||
l0_flush_global_state: resources.l0_flush_global_state,
|
l0_flush_global_state: resources.l0_flush_global_state,
|
||||||
|
|
||||||
|
handles: Default::default(),
|
||||||
};
|
};
|
||||||
result.repartition_threshold =
|
result.repartition_threshold =
|
||||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||||
@@ -2737,6 +2739,10 @@ impl Timeline {
|
|||||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||||
// on retry.
|
// on retry.
|
||||||
|
|
||||||
|
// Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
|
||||||
|
drop(guard); // drop write lock, update_layer_visibility will take a read lock.
|
||||||
|
self.update_layer_visibility().await;
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"loaded layer map with {} layers at {}, total physical size: {}",
|
"loaded layer map with {} layers at {}, total physical size: {}",
|
||||||
num_layers, disk_consistent_lsn, total_physical_size
|
num_layers, disk_consistent_lsn, total_physical_size
|
||||||
@@ -3723,6 +3729,17 @@ impl Timeline {
|
|||||||
&self.shard_identity
|
&self.shard_identity
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
|
||||||
|
ShardTimelineId {
|
||||||
|
shard_index: ShardIndex {
|
||||||
|
shard_number: self.shard_identity.number,
|
||||||
|
shard_count: self.shard_identity.count,
|
||||||
|
},
|
||||||
|
timeline_id: self.timeline_id,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Get a handle to the latest layer for appending.
|
/// Get a handle to the latest layer for appending.
|
||||||
///
|
///
|
||||||
@@ -4075,6 +4092,21 @@ impl Timeline {
|
|||||||
// release lock on 'layers'
|
// release lock on 'layers'
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
|
||||||
|
// This makes us refuse ingest until the new layers have been persisted to the remote.
|
||||||
|
self.remote_client
|
||||||
|
.wait_completion()
|
||||||
|
.await
|
||||||
|
.map_err(|e| match e {
|
||||||
|
WaitCompletionError::UploadQueueShutDownOrStopped
|
||||||
|
| WaitCompletionError::NotInitialized(
|
||||||
|
NotInitialized::ShuttingDown | NotInitialized::Stopped,
|
||||||
|
) => FlushLayerError::Cancelled,
|
||||||
|
WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
|
||||||
|
FlushLayerError::Other(anyhow!(e).into())
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||||
// a compaction can delete the file and then it won't be available for uploads any more.
|
// a compaction can delete the file and then it won't be available for uploads any more.
|
||||||
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
|
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
|
||||||
@@ -4667,27 +4699,6 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// The writer.finish() above already did the fsync of the inodes.
|
|
||||||
// We just need to fsync the directory in which these inodes are linked,
|
|
||||||
// which we know to be the timeline directory.
|
|
||||||
if !image_layers.is_empty() {
|
|
||||||
// We use fatal_err() below because the after writer.finish() returns with success,
|
|
||||||
// the in-memory state of the filesystem already has the layer file in its final place,
|
|
||||||
// and subsequent pageserver code could think it's durable while it really isn't.
|
|
||||||
let timeline_dir = VirtualFile::open(
|
|
||||||
&self
|
|
||||||
.conf
|
|
||||||
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.fatal_err("VirtualFile::open for timeline dir fsync");
|
|
||||||
timeline_dir
|
|
||||||
.sync_all()
|
|
||||||
.await
|
|
||||||
.fatal_err("VirtualFile::sync_all timeline dir");
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
|
|
||||||
// FIXME: we could add the images to be uploaded *before* returning from here, but right
|
// FIXME: we could add the images to be uploaded *before* returning from here, but right
|
||||||
@@ -4696,6 +4707,9 @@ impl Timeline {
|
|||||||
drop_wlock(guard);
|
drop_wlock(guard);
|
||||||
timer.stop_and_record();
|
timer.stop_and_record();
|
||||||
|
|
||||||
|
// Creating image layers may have caused some previously visible layers to be covered
|
||||||
|
self.update_layer_visibility().await;
|
||||||
|
|
||||||
Ok(image_layers)
|
Ok(image_layers)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -5460,20 +5474,22 @@ impl Timeline {
|
|||||||
} else {
|
} else {
|
||||||
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
|
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
|
||||||
};
|
};
|
||||||
|
let res = self
|
||||||
let img = match self
|
|
||||||
.walredo_mgr
|
.walredo_mgr
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.context("timeline has no walredo manager")
|
.context("timeline has no walredo manager")
|
||||||
.map_err(PageReconstructError::WalRedo)?
|
.map_err(PageReconstructError::WalRedo)?
|
||||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||||
.await
|
.await;
|
||||||
.context("reconstruct a page image")
|
let img = match res {
|
||||||
{
|
|
||||||
Ok(img) => img,
|
Ok(img) => img,
|
||||||
Err(e) => return Err(PageReconstructError::WalRedo(e)),
|
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||||
|
Err(walredo::Error::Other(e)) => {
|
||||||
|
return Err(PageReconstructError::WalRedo(
|
||||||
|
e.context("reconstruct a page image"),
|
||||||
|
))
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(img)
|
Ok(img)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
@@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory(
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline: &Timeline,
|
timeline: &Timeline,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
|
// Always ensure the lock order is compaction -> gc.
|
||||||
let guards = crate::timed(
|
let compaction_lock = timeline.compaction_lock.lock();
|
||||||
guards,
|
let compaction_lock = crate::timed(
|
||||||
"acquire gc and compaction locks",
|
compaction_lock,
|
||||||
|
"acquires compaction lock",
|
||||||
|
std::time::Duration::from_secs(5),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let gc_lock = timeline.gc_lock.lock();
|
||||||
|
let gc_lock = crate::timed(
|
||||||
|
gc_lock,
|
||||||
|
"acquires gc lock",
|
||||||
std::time::Duration::from_secs(5),
|
std::time::Duration::from_secs(5),
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
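The rewritten deletion path always takes the compaction lock before the gc lock and later drops them in the reverse order; as long as every caller uses the same order, the two locks cannot deadlock against each other. A generic tokio sketch of that discipline (only the two lock names are taken from the code above):

use tokio::sync::{Mutex, MutexGuard};

struct TimelineLocks {
    compaction_lock: Mutex<()>,
    gc_lock: Mutex<()>,
}

impl TimelineLocks {
    /// Acquire both locks in the fixed order compaction -> gc. Callers that all
    /// follow this order cannot deadlock with each other; dropping the returned
    /// guards in reverse (gc first) mirrors the deletion path above.
    async fn lock_for_delete(&self) -> (MutexGuard<'_, ()>, MutexGuard<'_, ()>) {
        let compaction = self.compaction_lock.lock().await;
        let gc = self.gc_lock.lock().await;
        (compaction, gc)
    }
}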
@@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory(
|
|||||||
.context("fsync_pre_mark_remove")?;
|
.context("fsync_pre_mark_remove")?;
|
||||||
|
|
||||||
info!("finished deleting layer files, releasing locks");
|
info!("finished deleting layer files, releasing locks");
|
||||||
drop(guards);
|
drop(gc_lock);
|
||||||
|
drop(compaction_lock);
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||||
@@ -206,11 +216,10 @@ impl DeleteTimelineFlow {
|
|||||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||||
// error out if some of the shutdown tasks have already been completed!
|
// error out if some of the shutdown tasks have already been completed!
|
||||||
#[instrument(skip_all, fields(%inplace))]
|
#[instrument(skip_all)]
|
||||||
pub async fn run(
|
pub async fn run(
|
||||||
tenant: &Arc<Tenant>,
|
tenant: &Arc<Tenant>,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
inplace: bool,
|
|
||||||
) -> Result<(), DeleteTimelineError> {
|
) -> Result<(), DeleteTimelineError> {
|
||||||
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
@@ -235,11 +244,7 @@ impl DeleteTimelineFlow {
|
|||||||
))?
|
))?
|
||||||
});
|
});
|
||||||
|
|
||||||
if inplace {
|
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||||
Self::background(guard, tenant.conf, tenant, &timeline).await?
|
|
||||||
} else {
|
|
||||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
pageserver/src/tenant/timeline/handle.rs (new file, 967 lines)
@@ -0,0 +1,967 @@
|
//! An efficient way to keep the timeline gate open without preventing
|
||||||
|
//! timeline shutdown for longer than a single call to a timeline method.
|
||||||
|
//!
|
||||||
|
//! # Motivation
|
||||||
|
//!
|
||||||
|
//! On a single page service connection, we're typically serving a single TenantTimelineId.
|
||||||
|
//!
|
||||||
|
//! Without sharding, there is a single Timeline object to which we dispatch
|
||||||
|
//! all requests. For example, a getpage request gets dispatched to the
|
||||||
|
//! Timeline::get method of the Timeline object that represents the
|
||||||
|
//! (tenant,timeline) of that connection.
|
||||||
|
//!
|
||||||
|
//! With sharding, for each request that comes in on the connection,
|
||||||
|
//! we first have to perform shard routing based on the requested key (=~ page number).
|
||||||
|
//! The result of shard routing is a Timeline object.
|
||||||
|
//! We then dispatch the request to that Timeline object.
|
||||||
|
//!
|
||||||
|
//! Regardless of whether the tenant is sharded or not, we want to ensure that
|
||||||
|
//! we hold the Timeline gate open while we're invoking the method on the
|
||||||
|
//! Timeline object.
|
||||||
|
//!
|
||||||
|
//! However, we want to avoid the overhead of entering the gate for every
|
||||||
|
//! method invocation.
|
||||||
|
//!
|
||||||
|
//! Further, for shard routing, we want to avoid calling the tenant manager to
|
||||||
|
//! resolve the shard for every request. Instead, we want to cache the
|
||||||
|
//! routing result so we can bypass the tenant manager for all subsequent requests
|
||||||
|
//! that get routed to that shard.
|
||||||
|
//!
|
||||||
|
//! Regardless of how we accomplish the above, it should not
|
||||||
|
//! prevent the Timeline from shutting down promptly.
|
||||||
|
//!
|
||||||
|
//! # Design
|
||||||
|
//!
|
||||||
|
//! There are three user-facing data structures:
|
||||||
|
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||||
|
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||||
|
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||||
|
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
|
||||||
|
//!
|
||||||
|
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
|
||||||
|
//!
|
||||||
|
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
|
||||||
|
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
|
||||||
|
//!
|
||||||
|
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||||
|
//!
|
||||||
|
//! A cache miss means we consult the tenant manager for shard routing,
|
||||||
|
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
|
||||||
|
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
|
||||||
|
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
|
||||||
|
//!
|
||||||
|
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||||
|
//! and find the `Weak<HandleInner>` in the cache.
|
||||||
|
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
|
||||||
|
//!
|
||||||
|
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||||
|
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||||
|
//!
|
||||||
|
//! # Memory Management / How The Reference Cycle Is Broken
|
||||||
|
//!
|
||||||
|
//! The attentive reader may have noticed the strong reference cycle
|
||||||
|
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
|
||||||
|
//!
|
||||||
|
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
|
||||||
|
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
|
||||||
|
//!
|
||||||
|
//! The cycle is broken by either
|
||||||
|
//! - `PerTimelineState::shutdown` or
|
||||||
|
//! - dropping the `Cache`.
|
||||||
|
//!
|
||||||
|
//! Concurrently existing `Handle`s will extend the existence of the cycle.
|
||||||
|
//! However, since `Handle`s are short-lived and new `Handle`s are not
|
||||||
|
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
|
||||||
|
//! that extension of the cycle is bounded.
|
||||||
|
//!
|
||||||
|
//! # Fast Path for Shard Routing
|
||||||
|
//!
|
||||||
|
//! The `Cache` has a fast path for shard routing to avoid calling into
|
||||||
|
//! the tenant manager for every request.
|
||||||
|
//!
|
||||||
|
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
|
||||||
|
//!
|
||||||
|
//! The current implementation uses the first entry in the hash map
|
||||||
|
//! to determine the `ShardParameters` and derive the correct
|
||||||
|
//! `ShardIndex` for the requested key.
|
||||||
|
//!
|
||||||
|
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
|
||||||
|
//!
|
||||||
|
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
|
||||||
|
//! it's a hit.
|
||||||
|
//!
|
||||||
|
//! ## Cache invalidation
|
||||||
|
//!
|
||||||
|
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
|
||||||
|
//! The only reasons why an entry in the cache can become stale are:
|
||||||
|
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
|
||||||
|
//! being detached, timeline or shard deleted, or pageserver is shutting down.
|
||||||
|
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
|
||||||
|
//!
|
||||||
|
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
|
||||||
|
//! timeline has shut down, and when that happens, we remove the entry from the cache.
|
||||||
|
//!
|
||||||
|
//! Regarding (2), the insight is that it is totally fine to keep dispatching requests
|
||||||
|
//! to the parent shard during a shard split. Eventually, the shard split task will
|
||||||
|
//! shut down the parent => case (1).
|
||||||
|
|
||||||
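The cache described above holds only a Weak reference per shard/timeline, upgrades it to a short-lived strong reference for each request, and lazily drops entries whose upgrade fails. A self-contained sketch of that Weak-upgrade-per-request pattern; the Inner and ConnectionCache types are stand-ins, not this module's real HandleInner/Cache:

use std::sync::{Arc, Weak};

// Stand-in for the long-lived per-timeline object; only the Arc/Weak wiring is the point.
struct Inner;

struct ConnectionCache {
    cached: Option<Weak<Inner>>,
}

impl ConnectionCache {
    /// Per-request lookup: upgrade the cached Weak into a short-lived strong ref.
    /// A failed upgrade means the owning side dropped its Arc (shutdown), so the
    /// stale entry is discarded and the caller would fall back to the slow path.
    fn get(&mut self) -> Option<Arc<Inner>> {
        match self.cached.as_ref().and_then(|w| w.upgrade()) {
            Some(strong) => Some(strong),
            None => {
                self.cached = None; // lazy invalidation, as described above
                None
            }
        }
    }
}

fn main() {
    // The long-lived strong ref lives on the "timeline" side...
    let per_timeline_state: Arc<Inner> = Arc::new(Inner);
    // ...the connection cache only holds a Weak to it.
    let mut cache = ConnectionCache {
        cached: Some(Arc::downgrade(&per_timeline_state)),
    };

    assert!(cache.get().is_some()); // hit while the strong ref is alive
    drop(per_timeline_state); // shutdown drops the long-lived Arc
    assert!(cache.get().is_none()); // later lookups miss and self-invalidate
}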
|
use std::collections::hash_map;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::atomic::AtomicBool;
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::Mutex;
|
||||||
|
use std::sync::Weak;
|
||||||
|
|
||||||
|
use pageserver_api::shard::ShardIdentity;
|
||||||
|
use tracing::instrument;
|
||||||
|
use tracing::trace;
|
||||||
|
use utils::id::TimelineId;
|
||||||
|
use utils::shard::ShardIndex;
|
||||||
|
use utils::shard::ShardNumber;
|
||||||
|
|
||||||
|
use crate::tenant::mgr::ShardSelector;
|
||||||
|
|
||||||
|
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
|
||||||
|
pub(crate) trait Types: Sized + std::fmt::Debug {
|
||||||
|
type TenantManagerError: Sized + std::fmt::Debug;
|
||||||
|
type TenantManager: TenantManager<Self> + Sized;
|
||||||
|
type Timeline: ArcTimeline<Self> + Sized;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
|
||||||
|
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
|
||||||
|
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
|
||||||
|
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
|
||||||
|
struct CacheId(u64);
|
||||||
|
|
||||||
|
impl CacheId {
|
||||||
|
fn next() -> Self {
|
||||||
|
static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
|
||||||
|
let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||||
|
if id == 0 {
|
||||||
|
panic!("CacheId::new() returned 0, overflow");
|
||||||
|
}
|
||||||
|
Self(id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// See module-level comment.
|
||||||
|
pub(crate) struct Cache<T: Types> {
|
||||||
|
id: CacheId,
|
||||||
|
map: Map<T>,
|
||||||
|
}
|
||||||
|
|
||||||
|
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
|
||||||
|
|
||||||
|
impl<T: Types> Default for Cache<T> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
id: CacheId::next(),
|
||||||
|
map: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
|
||||||
|
pub(crate) struct ShardTimelineId {
|
||||||
|
pub(crate) shard_index: ShardIndex,
|
||||||
|
pub(crate) timeline_id: TimelineId,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// See module-level comment.
|
||||||
|
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
|
||||||
|
struct HandleInner<T: Types> {
|
||||||
|
shut_down: AtomicBool,
|
||||||
|
timeline: T::Timeline,
|
||||||
|
// The timeline's gate held open.
|
||||||
|
_gate_guard: utils::sync::gate::GateGuard,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
|
||||||
|
///
|
||||||
|
/// See module-level comment for details.
|
||||||
|
pub struct PerTimelineState<T: Types> {
|
||||||
|
// None = shutting down
|
||||||
|
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Types> Default for PerTimelineState<T> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
handles: Mutex::new(Some(Default::default())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Abstract view of [`crate::tenant::mgr`], for testability.
|
||||||
|
pub(crate) trait TenantManager<T: Types> {
|
||||||
|
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
|
||||||
|
/// Errors are returned as [`GetError::TenantManager`].
|
||||||
|
async fn resolve(
|
||||||
|
&self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
) -> Result<T::Timeline, T::TenantManagerError>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Abstract view of an [`Arc<Timeline>`], for testability.
|
||||||
|
pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||||
|
fn gate(&self) -> &utils::sync::gate::Gate;
|
||||||
|
fn shard_timeline_id(&self) -> ShardTimelineId;
|
||||||
|
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||||
|
fn per_timeline_state(&self) -> &PerTimelineState<T>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Errors returned by [`Cache::get`].
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub(crate) enum GetError<T: Types> {
|
||||||
|
TenantManager(T::TenantManagerError),
|
||||||
|
TimelineGateClosed,
|
||||||
|
PerTimelineStateShutDown,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Internal type used in [`Cache::get`].
|
||||||
|
enum RoutingResult<T: Types> {
|
||||||
|
FastPath(Handle<T>),
|
||||||
|
SlowPath(ShardTimelineId),
|
||||||
|
NeedConsultTenantManager,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Types> Cache<T> {
|
||||||
|
/// See module-level comment for details.
|
||||||
|
///
|
||||||
|
/// Does NOT check for the shutdown state of [`Types::Timeline`].
|
||||||
|
/// Instead, the methods of [`Types::Timeline`] that are invoked through
|
||||||
|
/// the [`Handle`] are responsible for checking these conditions
|
||||||
|
/// and if so, return an error that causes the page service to
|
||||||
|
/// close the connection.
|
||||||
|
#[instrument(level = "trace", skip_all)]
|
||||||
|
pub(crate) async fn get(
|
||||||
|
&mut self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
tenant_manager: &T::TenantManager,
|
||||||
|
) -> Result<Handle<T>, GetError<T>> {
|
||||||
|
// terminates because each iteration removes an element from the map
|
||||||
|
loop {
|
||||||
|
let handle = self
|
||||||
|
.get_impl(timeline_id, shard_selector, tenant_manager)
|
||||||
|
.await?;
|
||||||
|
if handle.0.shut_down.load(Ordering::Relaxed) {
|
||||||
|
let removed = self
|
||||||
|
.map
|
||||||
|
.remove(&handle.0.timeline.shard_timeline_id())
|
||||||
|
.expect("invariant of get_impl is that the returned handle is in the map");
|
||||||
|
assert!(
|
||||||
|
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
|
||||||
|
"shard_timeline_id() incorrect?"
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
return Ok(handle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(level = "trace", skip_all)]
|
||||||
|
async fn get_impl(
|
||||||
|
&mut self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
tenant_manager: &T::TenantManager,
|
||||||
|
) -> Result<Handle<T>, GetError<T>> {
|
||||||
|
let miss: ShardSelector = {
|
||||||
|
let routing_state = self.shard_routing(timeline_id, shard_selector);
|
||||||
|
match routing_state {
|
||||||
|
RoutingResult::FastPath(handle) => return Ok(handle),
|
||||||
|
RoutingResult::SlowPath(key) => match self.map.get(&key) {
|
||||||
|
Some(cached) => match cached.upgrade() {
|
||||||
|
Some(upgraded) => return Ok(Handle(upgraded)),
|
||||||
|
None => {
|
||||||
|
trace!("handle cache stale");
|
||||||
|
self.map.remove(&key).unwrap();
|
||||||
|
ShardSelector::Known(key.shard_index)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => ShardSelector::Known(key.shard_index),
|
||||||
|
},
|
||||||
|
RoutingResult::NeedConsultTenantManager => shard_selector,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
self.get_miss(timeline_id, miss, tenant_manager).await
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn shard_routing(
|
||||||
|
&mut self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
) -> RoutingResult<T> {
|
||||||
|
loop {
|
||||||
|
// terminates because each iteration removes an element from the map
|
||||||
|
let Some((first_key, first_handle)) = self.map.iter().next() else {
|
||||||
|
return RoutingResult::NeedConsultTenantManager;
|
||||||
|
};
|
||||||
|
let Some(first_handle) = first_handle.upgrade() else {
|
||||||
|
// TODO: dedup with get()
|
||||||
|
trace!("handle cache stale");
|
||||||
|
let first_key_owned = *first_key;
|
||||||
|
self.map.remove(&first_key_owned).unwrap();
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
|
||||||
|
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
|
||||||
|
shard_number: shard_num,
|
||||||
|
shard_count: first_handle_shard_identity.count,
|
||||||
|
};
|
||||||
|
|
||||||
|
let need_idx = match shard_selector {
|
||||||
|
ShardSelector::Page(key) => {
|
||||||
|
make_shard_index(first_handle_shard_identity.get_shard_number(&key))
|
||||||
|
}
|
||||||
|
ShardSelector::Zero => make_shard_index(ShardNumber(0)),
|
||||||
|
ShardSelector::Known(shard_idx) => shard_idx,
|
||||||
|
};
|
||||||
|
let need_shard_timeline_id = ShardTimelineId {
|
||||||
|
shard_index: need_idx,
|
||||||
|
timeline_id,
|
||||||
|
};
|
||||||
|
let first_handle_shard_timeline_id = ShardTimelineId {
|
||||||
|
shard_index: first_handle_shard_identity.shard_index(),
|
||||||
|
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
|
||||||
|
};
|
||||||
|
|
||||||
|
if need_shard_timeline_id == first_handle_shard_timeline_id {
|
||||||
|
return RoutingResult::FastPath(Handle(first_handle));
|
||||||
|
} else {
|
||||||
|
return RoutingResult::SlowPath(need_shard_timeline_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(level = "trace", skip_all)]
|
||||||
|
#[inline(always)]
|
||||||
|
async fn get_miss(
|
||||||
|
&mut self,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
shard_selector: ShardSelector,
|
||||||
|
tenant_manager: &T::TenantManager,
|
||||||
|
) -> Result<Handle<T>, GetError<T>> {
|
||||||
|
match tenant_manager.resolve(timeline_id, shard_selector).await {
|
||||||
|
Ok(timeline) => {
|
||||||
|
let key = timeline.shard_timeline_id();
|
||||||
|
match &shard_selector {
|
||||||
|
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
|
||||||
|
ShardSelector::Page(_) => (), // gotta trust tenant_manager
|
||||||
|
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||||
|
}
|
||||||
|
|
||||||
|
let gate_guard = match timeline.gate().enter() {
|
||||||
|
Ok(guard) => guard,
|
||||||
|
Err(_) => {
|
||||||
|
return Err(GetError::TimelineGateClosed);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
trace!("creating new HandleInner");
|
||||||
|
let handle = Arc::new(
|
||||||
|
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
|
||||||
|
// so we can identify reference cycle bugs.
|
||||||
|
HandleInner {
|
||||||
|
shut_down: AtomicBool::new(false),
|
||||||
|
_gate_guard: gate_guard,
|
||||||
|
timeline: timeline.clone(),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
let handle = {
|
||||||
|
let mut lock_guard = timeline
|
||||||
|
.per_timeline_state()
|
||||||
|
.handles
|
||||||
|
.lock()
|
||||||
|
.expect("mutex poisoned");
|
||||||
|
match &mut *lock_guard {
|
||||||
|
Some(per_timeline_state) => {
|
||||||
|
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
|
||||||
|
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||||
|
match self.map.entry(key) {
|
||||||
|
hash_map::Entry::Occupied(_o) => {
|
||||||
|
// This cannot happen because
|
||||||
|
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
|
||||||
|
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
|
||||||
|
// while we were waiting for the tenant manager.
|
||||||
|
unreachable!()
|
||||||
|
}
|
||||||
|
hash_map::Entry::Vacant(v) => {
|
||||||
|
v.insert(Arc::downgrade(&handle));
|
||||||
|
handle
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
return Err(GetError::PerTimelineStateShutDown);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Ok(Handle(handle))
|
||||||
|
}
|
||||||
|
Err(e) => Err(GetError::TenantManager(e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Types> PerTimelineState<T> {
|
||||||
|
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
|
||||||
|
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||||
|
/// Even if [`TenantManager::resolve`] would still resolve to it.
|
||||||
|
///
|
||||||
|
/// Already-alive [`Handle`]s will remain open, usable, and keep the [`ArcTimeline`] alive.
|
||||||
|
/// That's ok because they're short-lived. See module-level comment for details.
|
||||||
|
#[instrument(level = "trace", skip_all)]
|
||||||
|
pub(super) fn shutdown(&self) {
|
||||||
|
let handles = self
|
||||||
|
.handles
|
||||||
|
.lock()
|
||||||
|
.expect("mutex poisoned")
|
||||||
|
// NB: this .take() sets locked to None.
|
||||||
|
// That's what makes future `Cache::get` misses fail.
|
||||||
|
// Cache hits are taken care of below.
|
||||||
|
.take();
|
||||||
|
let Some(handles) = handles else {
|
||||||
|
trace!("already shut down");
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
for handle in handles.values() {
|
||||||
|
// Make hits fail.
|
||||||
|
handle.shut_down.store(true, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
drop(handles);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||||
|
type Target = T::Timeline;
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
&self.0.timeline
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
impl<T: Types> Drop for HandleInner<T> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
trace!("HandleInner dropped");
|
||||||
|
}
|
||||||
|
}

// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
impl<T: Types> Drop for Cache<T> {
    fn drop(&mut self) {
        for (_, weak) in self.map.drain() {
            if let Some(strong) = weak.upgrade() {
                // handle is still being kept alive in PerTimelineState
                let timeline = strong.timeline.per_timeline_state();
                let mut handles = timeline.handles.lock().expect("mutex poisoned");
                if let Some(handles) = &mut *handles {
                    let Some(removed) = handles.remove(&self.id) else {
                        // There could have been a shutdown in between us upgrading the weak and locking the mutex.
                        continue;
                    };
                    assert!(Arc::ptr_eq(&removed, &strong));
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use pageserver_api::{
        key::{rel_block_to_key, Key, DBDIR_KEY},
        models::ShardParameters,
        reltag::RelTag,
        shard::ShardStripeSize,
    };
    use utils::shard::ShardCount;

    use super::*;

    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);

    #[derive(Debug)]
    struct TestTypes;
    impl Types for TestTypes {
        type TenantManagerError = anyhow::Error;
        type TenantManager = StubManager;
        type Timeline = Arc<StubTimeline>;
    }

    struct StubManager {
        shards: Vec<Arc<StubTimeline>>,
    }

    struct StubTimeline {
        gate: utils::sync::gate::Gate,
        id: TimelineId,
        shard: ShardIdentity,
        per_timeline_state: PerTimelineState<TestTypes>,
        myself: Weak<StubTimeline>,
    }

    impl StubTimeline {
        fn getpage(&self) {
            // do nothing
        }
    }

    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
        fn gate(&self) -> &utils::sync::gate::Gate {
            &self.gate
        }

        fn shard_timeline_id(&self) -> ShardTimelineId {
            ShardTimelineId {
                shard_index: self.shard.shard_index(),
                timeline_id: self.id,
            }
        }

        fn get_shard_identity(&self) -> &ShardIdentity {
            &self.shard
        }

        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
            &self.per_timeline_state
        }
    }

    impl TenantManager<TestTypes> for StubManager {
        async fn resolve(
            &self,
            timeline_id: TimelineId,
            shard_selector: ShardSelector,
        ) -> anyhow::Result<Arc<StubTimeline>> {
            for timeline in &self.shards {
                if timeline.id == timeline_id {
                    match &shard_selector {
                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Zero => continue,
                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Page(_) => continue,
                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Known(_) => continue,
                    }
                }
            }
            anyhow::bail!("not found")
        }
    }

    #[tokio::test(start_paused = true)]
    async fn test_timeline_shutdown() {
        crate::tenant::harness::setup_logging();

        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();

        //
        // fill the cache
        //
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (2, 1),
            "strong: shard0, mgr; weak: myself"
        );

        let handle: Handle<_> = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        let handle_inner_weak = Arc::downgrade(&handle.0);
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
        assert_eq!(
            (
                Weak::strong_count(&handle_inner_weak),
                Weak::weak_count(&handle_inner_weak)
            ),
            (2, 2),
            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
        );
        assert_eq!(cache.map.len(), 1);

        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
        );
        drop(handle);
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
        );

        //
        // demonstrate that Handle holds up gate closure
        // but shutdown prevents new handles from being handed out
        //

        tokio::select! {
            _ = shard0.gate.close() => {
                panic!("cache and per-timeline handler state keep cache open");
            }
            _ = tokio::time::sleep(FOREVER) => {
                // NB: first poll of close() makes it enter closing state
            }
        }

        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));

        // SHUTDOWN
        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown

        assert_eq!(
            1,
            Weak::strong_count(&handle_inner_weak),
            "through local var handle"
        );
        assert_eq!(
            cache.map.len(),
            1,
            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
        );
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(via handle), shard0, mgr; weak: myself"
        );

        // this handle is perfectly usable
        handle.getpage();

        cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
        assert_eq!(
            cache.map.len(),
            0,
            "first access after shutdown cleans up the Weak's from the cache"
        );

        tokio::select! {
            _ = shard0.gate.close() => {
                panic!("handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }

        drop(handle);
        assert_eq!(
            0,
            Weak::strong_count(&handle_inner_weak),
            "the HandleInner destructor already ran"
        );
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (2, 1),
            "strong: shard0, mgr; weak: myself"
        );

        // closing gate succeeds after dropping handle
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("handle is dropped, no other gate holders exist")
            }
        }

        // map gets cleaned on next lookup
        cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown");
        assert_eq!(cache.map.len(), 0);

        // ensure all refs to shard0 are gone and we're not leaking anything
        let myself = Weak::clone(&shard0.myself);
        drop(shard0);
        drop(mgr);
        assert_eq!(Weak::strong_count(&myself), 0);
    }

    #[tokio::test]
    async fn test_multiple_timelines_and_deletion() {
        crate::tenant::harness::setup_logging();

        let timeline_a = TimelineId::generate();
        let timeline_b = TimelineId::generate();
        assert_ne!(timeline_a, timeline_b);
        let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_a,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_b,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mut mgr = StubManager {
            shards: vec![timeline_a.clone(), timeline_b.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();

        cache
            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have it");
        cache
            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have it");
        assert_eq!(cache.map.len(), 2);

        // delete timeline A
        timeline_a.per_timeline_state.shutdown();
        mgr.shards.retain(|t| t.id != timeline_a.id);
        assert!(
            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
                .await
                .is_err(),
            "broken StubManager implementation"
        );

        assert_eq!(
            cache.map.len(),
            2,
            "cache still has a Weak handle to Timeline A"
        );
        cache
            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown");
        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");

        cache
            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we still have it");
    }

    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
        rel_block_to_key(
            RelTag {
                spcnode: 1663,
                dbnode: 208101,
                relnode: 2620,
                forknum: 0,
            },
            shard.0 as u32 * params.stripe_size.0,
        )
    }

    #[tokio::test(start_paused = true)]
    async fn test_shard_split() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let parent = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child_params = ShardParameters {
            count: ShardCount(2),
            stripe_size: ShardStripeSize::default(),
        };
        let child0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child1 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child_shards_by_shard_number = [child0.clone(), child1.clone()];

        let mut cache = Cache::<TestTypes>::default();

        // fill the cache with the parent
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![parent.clone()],
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(&handle.myself, &parent.myself),
                "mgr returns parent first"
            );
            drop(handle);
        }

        //
        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
        //

        // while we haven't shut down the parent, the cache will return the cached parent, even
        // if the tenant manager returns the child
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(&handle.myself, &parent.myself),
                "mgr returns parent"
            );
            drop(handle);
        }

        let parent_handle = cache
            .get(
                timeline_id,
                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
                &StubManager {
                    shards: vec![parent.clone()],
                },
            )
            .await
            .expect("we have it");
        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));

        // invalidate the cache
        parent.per_timeline_state.shutdown();

        // the cache will now return the child, even though the parent handle still exists
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(
                    &handle.myself,
                    &child_shards_by_shard_number[i as usize].myself
                ),
                "mgr returns child"
            );
            drop(handle);
        }

        // all the while the parent handle kept the parent gate open
        tokio::select! {
            _ = parent_handle.gate.close() => {
                panic!("parent handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }
        drop(parent_handle);
        tokio::select! {
            _ = parent.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("parent handle is dropped, no other gate holders exist")
            }
        }
    }

    #[tokio::test(start_paused = true)]
    async fn test_connection_handler_exit() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
            shard: ShardIdentity::unsharded(),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        // Simulate 10 connections that are opened, used, and closed
        let mut used_handles = vec![];
        for _ in 0..10 {
            let mut cache = Cache::<TestTypes>::default();
            let handle = {
                let handle = cache
                    .get(timeline_id, ShardSelector::Page(key), &mgr)
                    .await
                    .expect("we have the timeline");
                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
                handle
            };
            handle.getpage();
            used_handles.push(Arc::downgrade(&handle.0));
        }

        // No handles exist, thus gates are closed and don't require shutdown
        assert!(used_handles
            .iter()
            .all(|weak| Weak::strong_count(weak) == 0));

        // ... thus the gate should close immediately, even without shutdown
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("handle is dropped, no other gate holders exist")
            }
        }
    }
}
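The tests above walk the full lifecycle of the cache. As a rough sketch only — reusing the test module's StubManager/TestTypes stubs, with a request loop invented purely for illustration (this is not the page-service code) — the intended per-connection usage is: one Cache per connection, one short-lived Handle per request.

    // Illustrative sketch, not from the diff: the types come from the module and
    // its test stubs above; the loop and error handling are assumptions.
    async fn handle_connection_sketch(
        mut cache: Cache<TestTypes>,
        mgr: &StubManager,
        requests: &[(TimelineId, Key)],
    ) {
        for (timeline_id, key) in requests {
            match cache.get(*timeline_id, ShardSelector::Page(*key), mgr).await {
                Ok(handle) => {
                    // Deref gives access to the Timeline for this one request.
                    handle.getpage();
                    // Dropping the handle here does not close the timeline's gate;
                    // PerTimelineState still holds the HandleInner until shutdown.
                }
                Err(_) => break, // timeline shut down or no longer resolvable
            }
        }
        // Dropping `cache` prunes its entries from each PerTimelineState,
        // breaking the Cache -> HandleInner reference cycle.
    }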
@@ -35,6 +35,10 @@ impl LayerManager {
         self.layer_fmgr.get_from_desc(desc)
     }
 
+    pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
+        self.layer_fmgr.get_from_key(desc)
+    }
+
     /// Get an immutable reference to the layer map.
     ///
     /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
@@ -365,16 +369,20 @@ impl<T> Default for LayerFileManager<T> {
 }
 
 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
+    fn get_from_key(&self, key: &PersistentLayerKey) -> T {
         // The assumption for the `expect()` is that all code maintains the following invariant:
         // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
         self.0
-            .get(&desc.key())
-            .with_context(|| format!("get layer from desc: {}", desc.layer_name()))
+            .get(key)
+            .with_context(|| format!("get layer from key: {}", key))
             .expect("not found")
             .clone()
     }
 
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
+        self.get_from_key(&desc.key())
+    }
+
     fn contains_key(&self, key: &PersistentLayerKey) -> bool {
         self.0.contains_key(key)
     }
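A minimal usage sketch of the new accessor (the helper function below is hypothetical; `get_from_key`, `Layer`, and `PersistentLayerKey` are the names from the hunk above, and `get_from_desc` now just delegates to the key-based lookup):

    // Hypothetical caller, not from the diff: fetch a layer when only the
    // PersistentLayerKey is at hand, without building a full descriptor.
    fn layer_for_key(layers: &LayerManager, key: &PersistentLayerKey) -> Layer {
        layers.get_from_key(key)
    }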
@@ -241,6 +241,9 @@ impl PostgresRedoManager {
 
     /// Shut down the WAL redo manager.
     ///
+    /// Returns `true` if this call was the one that initiated shutdown.
+    /// `true` may be observed by no caller if the first caller stops polling.
+    ///
     /// After this future completes
     /// - no redo process is running
     /// - no new redo process will be spawned
@@ -250,22 +253,32 @@ impl PostgresRedoManager {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
-    pub async fn shutdown(&self) {
+    pub async fn shutdown(&self) -> bool {
         // prevent new processes from being spawned
-        let permit = match self.redo_process.get_or_init_detached().await {
+        let maybe_permit = match self.redo_process.get_or_init_detached().await {
             Ok(guard) => {
-                let (proc, permit) = guard.take_and_deinit();
-                drop(proc); // this just drops the Arc, its refcount may not be zero yet
-                permit
+                if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
+                    None
+                } else {
+                    let (proc, permit) = guard.take_and_deinit();
+                    drop(proc); // this just drops the Arc, its refcount may not be zero yet
+                    Some(permit)
+                }
             }
-            Err(permit) => permit,
+            Err(permit) => Some(permit),
         };
-        self.redo_process
-            .set(ProcessOnceCell::ManagerShutDown, permit);
+        let it_was_us = if let Some(permit) = maybe_permit {
+            self.redo_process
+                .set(ProcessOnceCell::ManagerShutDown, permit);
+            true
+        } else {
+            false
+        };
         // wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
         // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
         // for the underlying process.
         self.launched_processes.close().await;
+        it_was_us
     }
 
 /// This type doesn't have its own background task to check for idleness: we
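A hedged sketch of how a caller might use the new return value (the wrapper function and log messages below are assumptions for illustration, not code from this diff): the boolean lets exactly one caller account for having initiated shutdown, while concurrent callers simply wait for it to complete.

    // Illustrative only; assumes the `shutdown(&self) -> bool` signature above.
    async fn shutdown_walredo(manager: &PostgresRedoManager) {
        if manager.shutdown().await {
            tracing::info!("walredo manager shut down by this caller");
        } else {
            tracing::debug!("walredo manager shutdown was already initiated elsewhere");
        }
    }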
7 pageserver/test_data/indices/mixed_workload/README.md Normal file
@@ -0,0 +1,7 @@
+
+# This was captured from one shard of a large tenant in staging.
+
+# It has a mixture of deltas and image layers, >1000 layers in total.
+
+# This is suitable for general smoke tests that want an index which is not
+# trivially small, but doesn't contain weird/pathological cases.
File diff suppressed because one or more lines are too long
153 poetry.lock generated
@@ -870,6 +870,96 @@ files = [
|
|||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clickhouse-connect"
|
||||||
|
version = "0.7.17"
|
||||||
|
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
|
||||||
|
optional = false
|
||||||
|
python-versions = "~=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
|
||||||
|
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
certifi = "*"
|
||||||
|
lz4 = "*"
|
||||||
|
pytz = "*"
|
||||||
|
urllib3 = ">=1.26"
|
||||||
|
zstandard = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
arrow = ["pyarrow"]
|
||||||
|
numpy = ["numpy"]
|
||||||
|
orjson = ["orjson"]
|
||||||
|
pandas = ["pandas"]
|
||||||
|
sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
|
||||||
|
tzlocal = ["tzlocal (>=4.0)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorama"
|
name = "colorama"
|
||||||
version = "0.4.5"
|
version = "0.4.5"
|
||||||
@@ -1470,6 +1560,56 @@ files = [
|
|||||||
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
|
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lz4"
|
||||||
|
version = "4.3.3"
|
||||||
|
description = "LZ4 Bindings for Python"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
|
||||||
|
{file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
|
||||||
|
{file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
|
||||||
|
{file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
|
||||||
|
{file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
|
||||||
|
{file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
|
||||||
|
{file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
|
||||||
|
flake8 = ["flake8"]
|
||||||
|
tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "markupsafe"
|
name = "markupsafe"
|
||||||
version = "2.1.1"
|
version = "2.1.1"
|
||||||
@@ -2361,6 +2501,17 @@ files = [
 [package.dependencies]
 six = ">=1.5"
 
+[[package]]
+name = "pytz"
+version = "2024.1"
+description = "World timezone definitions, modern and historical"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
+    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
+]
+
 [[package]]
 name = "pywin32"
 version = "301"
@@ -3206,4 +3357,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
+content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
@@ -41,6 +41,7 @@ zstandard = "^0.21.0"
 httpx = {extras = ["http2"], version = "^0.26.0"}
 pytest-repeat = "^0.9.3"
 websockets = "^12.0"
+clickhouse-connect = "^0.7.16"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
@@ -21,6 +21,7 @@ pub mod json_ctrl;
 pub mod metrics;
 pub mod patch_control_file;
 pub mod pull_timeline;
+pub mod rate_limit;
 pub mod receive_wal;
 pub mod recovery;
 pub mod remove_wal;
@@ -53,6 +54,7 @@ pub mod defaults {
     pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
     pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
     pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
+    pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2;
 
     // By default, our required residency before eviction is the same as the period that passes
     // before uploading a partial segment, so that in normal operation the eviction can happen
49 safekeeper/src/rate_limit.rs Normal file
@@ -0,0 +1,49 @@
+use std::sync::Arc;
+
+use rand::Rng;
+
+use crate::metrics::MISC_OPERATION_SECONDS;
+
+/// Global rate limiter for background tasks.
+#[derive(Clone)]
+pub struct RateLimiter {
+    partial_backup: Arc<tokio::sync::Semaphore>,
+    eviction: Arc<tokio::sync::Semaphore>,
+}
+
+impl RateLimiter {
+    /// Create a new rate limiter.
+    /// - `partial_backup_max`: maximum number of concurrent partial backups.
+    /// - `eviction_max`: maximum number of concurrent timeline evictions.
+    pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self {
+        Self {
+            partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)),
+            eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)),
+        }
+    }
+
+    /// Get a permit for partial backup. This will block if the maximum number of concurrent
+    /// partial backups is reached.
+    pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit {
+        let _timer = MISC_OPERATION_SECONDS
+            .with_label_values(&["partial_permit_acquire"])
+            .start_timer();
+        self.partial_backup
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("semaphore is closed")
+    }
+
+    /// Try to get a permit for timeline eviction. This will return None if the maximum number of
+    /// concurrent timeline evictions is reached.
+    pub fn try_acquire_eviction(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
+        self.eviction.clone().try_acquire_owned().ok()
+    }
+}
+
+/// Generate a random duration that is a fraction of the given duration.
+pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration {
+    let randf64 = rand::thread_rng().gen_range(0.0..1.0);
+    duration.mul_f64(randf64)
+}
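Illustrative only — how a background task might use the limiter and the jitter helper above (the task body and call sites are assumptions, not safekeeper code):

    // Sketch: hold a partial-backup permit for the duration of one upload.
    async fn run_partial_backup_sketch(limiter: RateLimiter) {
        // Blocks until one of the `partial_backup_max` permits is free.
        let _permit = limiter.acquire_partial_backup().await;
        // ... upload the partial segment while holding the permit ...
    }

    // Sketch: jitter a retry delay so many timelines don't wake up at once.
    fn jittered_delay(min_resident: std::time::Duration) -> std::time::Duration {
        rand_duration(&min_resident)
    }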
@@ -25,6 +25,7 @@ use utils::{
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 
+use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{
     AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
@@ -36,7 +37,7 @@ use crate::timeline_guard::ResidenceGuard;
 use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
 use crate::wal_backup::{self};
-use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter};
+use crate::wal_backup_partial::PartialRemoteSegment;
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
 use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
@@ -5,7 +5,6 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
-use std::time::Instant;
 use tokio::{
     fs::File,
     io::{AsyncRead, AsyncWriteExt},
@@ -15,6 +14,7 @@ use utils::crashsafe::durable_rename;
 
 use crate::{
     metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
+    rate_limit::rand_duration,
     timeline_manager::{Manager, StateSnapshot},
     wal_backup,
     wal_backup_partial::{self, PartialRemoteSegment},
@@ -50,7 +50,6 @@ impl Manager {
             .flush_lsn
             .segment_number(self.wal_seg_size)
             == self.last_removed_segno + 1
-            && self.resident_since.elapsed() >= self.conf.eviction_min_resident
     }
 
     /// Evict the timeline to remote storage.
@@ -112,7 +111,8 @@ impl Manager {
             return;
         }
 
-        self.resident_since = Instant::now();
+        self.evict_not_before =
+            tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
 
         info!("successfully restored evicted timeline");
     }
@@ -23,6 +23,7 @@ use utils::lsn::Lsn;
 use crate::{
     control_file::{FileStorage, Storage},
     metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
+    rate_limit::{rand_duration, RateLimiter},
     recovery::recovery_main,
     remove_wal::calc_horizon_lsn,
     safekeeper::Term,
@@ -32,7 +33,7 @@ use crate::{
     timeline_guard::{AccessService, GuardId, ResidenceGuard},
     timelines_set::{TimelineSetGuard, TimelinesSet},
     wal_backup::{self, WalBackupTaskHandle},
-    wal_backup_partial::{self, PartialRemoteSegment, RateLimiter},
+    wal_backup_partial::{self, PartialRemoteSegment},
     SafeKeeperConf,
 };
 
@@ -185,11 +186,11 @@ pub(crate) struct Manager {
 
     // misc
     pub(crate) access_service: AccessService,
-    pub(crate) partial_backup_rate_limiter: RateLimiter,
+    pub(crate) global_rate_limiter: RateLimiter,
 
     // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
     // evict them if they go inactive very soon after being restored.
-    pub(crate) resident_since: std::time::Instant,
+    pub(crate) evict_not_before: Instant,
 }
 
 /// This task gets spawned alongside each timeline and is responsible for managing the timeline's
@@ -202,7 +203,7 @@ pub async fn main_task(
     broker_active_set: Arc<TimelinesSet>,
     manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
     mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
-    partial_backup_rate_limiter: RateLimiter,
+    global_rate_limiter: RateLimiter,
 ) {
     tli.set_status(Status::Started);
 
@@ -220,7 +221,7 @@ pub async fn main_task(
         conf,
         broker_active_set,
         manager_tx,
-        partial_backup_rate_limiter,
+        global_rate_limiter,
     )
     .await;
 
@@ -254,9 +255,29 @@ pub async fn main_task(
             mgr.set_status(Status::UpdatePartialBackup);
             mgr.update_partial_backup(&state_snapshot).await;
 
-            if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) {
-                mgr.set_status(Status::EvictTimeline);
-                mgr.evict_timeline().await;
+            let now = Instant::now();
+            if mgr.evict_not_before > now {
+                // we should wait until evict_not_before
+                update_next_event(&mut next_event, mgr.evict_not_before);
+            }
+
+            if mgr.conf.enable_offload
+                && mgr.evict_not_before <= now
+                && mgr.ready_for_eviction(&next_event, &state_snapshot)
+            {
+                // check rate limiter and evict timeline if possible
+                match mgr.global_rate_limiter.try_acquire_eviction() {
+                    Some(_permit) => {
+                        mgr.set_status(Status::EvictTimeline);
+                        mgr.evict_timeline().await;
+                    }
+                    None => {
+                        // we can't evict timeline now, will try again later
+                        mgr.evict_not_before =
+                            Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
+                        update_next_event(&mut next_event, mgr.evict_not_before);
+                    }
+                }
             }
         }
    }
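The hunk above defers eviction until `evict_not_before` and, when no eviction permit is available, pushes the next attempt out by a random fraction of `eviction_min_resident`. A hypothetical helper restating just that backoff step in isolation (not part of the diff):

    // Sketch of the contention backoff used above: jitter the next attempt so
    // timelines that lost the race don't all retry at the same instant.
    fn next_attempt_after_contention(
        min_resident: std::time::Duration,
    ) -> tokio::time::Instant {
        tokio::time::Instant::now() + rand_duration(&min_resident)
    }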
@@ -334,11 +355,10 @@ impl Manager {
         conf: SafeKeeperConf,
         broker_active_set: Arc<TimelinesSet>,
         manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
-        partial_backup_rate_limiter: RateLimiter,
+        global_rate_limiter: RateLimiter,
     ) -> Manager {
         let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
         Manager {
-            conf,
             wal_seg_size: tli.get_wal_seg_size().await,
             walsenders: tli.get_walsenders().clone(),
             state_version_rx: tli.get_state_version_rx(),
@@ -353,8 +373,10 @@ impl Manager {
             partial_backup_uploaded,
             access_service: AccessService::new(manager_tx),
             tli,
-            partial_backup_rate_limiter,
-            resident_since: std::time::Instant::now(),
+            global_rate_limiter,
+            // to smooth out evictions spike after restart
+            evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
+            conf,
         }
     }

@@ -541,7 +563,7 @@ impl Manager {
         self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
             self.wal_resident_timeline(),
             self.conf.clone(),
-            self.partial_backup_rate_limiter.clone(),
+            self.global_rate_limiter.clone(),
         )));
     }

@@ -2,10 +2,11 @@
 //! All timelines should always be present in this map, this is done by loading them
 //! all from the disk on startup and keeping them in memory.
 
+use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
+use crate::rate_limit::RateLimiter;
 use crate::safekeeper::ServerInfo;
 use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup_partial::RateLimiter;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
@@ -31,7 +32,7 @@ struct GlobalTimelinesState {
     conf: Option<SafeKeeperConf>,
     broker_active_set: Arc<TimelinesSet>,
     load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
-    partial_backup_rate_limiter: RateLimiter,
+    global_rate_limiter: RateLimiter,
 }
 
 // Used to prevent concurrent timeline loading.
@@ -50,7 +51,7 @@ impl GlobalTimelinesState {
         (
             self.get_conf().clone(),
             self.broker_active_set.clone(),
-            self.partial_backup_rate_limiter.clone(),
+            self.global_rate_limiter.clone(),
         )
     }
 
@@ -85,7 +86,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
         conf: None,
         broker_active_set: Arc::new(TimelinesSet::default()),
         load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
-        partial_backup_rate_limiter: RateLimiter::new(1),
+        global_rate_limiter: RateLimiter::new(1, 1),
     })
 });
 
@@ -99,7 +100,10 @@ impl GlobalTimelines {
         // lock, so use explicit block
         let tenants_dir = {
             let mut state = TIMELINES_STATE.lock().unwrap();
-            state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency);
+            state.global_rate_limiter = RateLimiter::new(
+                conf.partial_backup_concurrency,
+                DEFAULT_EVICTION_CONCURRENCY,
+            );
             state.conf = Some(conf);
 
             // Iterate through all directories and load tenants for all directories
@@ -18,8 +18,6 @@
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
 
-use std::sync::Arc;
-
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
@@ -30,6 +28,7 @@ use utils::lsn::Lsn;
 
 use crate::{
     metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
+    rate_limit::{rand_duration, RateLimiter},
     safekeeper::Term,
     timeline::WalResidentTimeline,
     timeline_manager::StateSnapshot,
@@ -37,30 +36,6 @@ use crate::{
     SafeKeeperConf,
 };
 
-#[derive(Clone)]
-pub struct RateLimiter {
-    semaphore: Arc<tokio::sync::Semaphore>,
-}
-
-impl RateLimiter {
-    pub fn new(permits: usize) -> Self {
-        Self {
-            semaphore: Arc::new(tokio::sync::Semaphore::new(permits)),
-        }
-    }
-
-    async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit {
-        let _timer = MISC_OPERATION_SECONDS
-            .with_label_values(&["partial_permit_acquire"])
-            .start_timer();
-        self.semaphore
-            .clone()
-            .acquire_owned()
-            .await
-            .expect("semaphore is closed")
-    }
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub enum UploadStatus {
     /// Upload is in progress. This status should be used only for garbage collection,
@@ -352,6 +327,7 @@ pub async fn main_task(
 ) -> Option<PartialRemoteSegment> {
     debug!("started");
     let await_duration = conf.partial_backup_timeout;
+    let mut first_iteration = true;
 
     let (_, persistent_state) = tli.get_state().await;
     let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
@@ -419,6 +395,15 @@ pub async fn main_task(
             }
         }
 
+        // smoothing the load after restart, by sleeping for a random time.
+        // if this is not the first iteration, we will wait for the full await_duration
+        let await_duration = if first_iteration {
+            first_iteration = false;
+            rand_duration(&await_duration)
+        } else {
+            await_duration
+        };
+
         // fixing the segno and waiting some time to prevent reuploading the same segment too often
         let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
         let timeout = tokio::time::sleep(await_duration);
@@ -454,7 +439,7 @@ pub async fn main_task(
         }
 
         // limit concurrent uploads
-        let _upload_permit = limiter.acquire_owned().await;
+        let _upload_permit = limiter.acquire_partial_backup().await;
 
         let prepared = backup.prepare_upload().await;
         if let Some(seg) = &uploaded_segment {
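The smoothing logic above relies on rand_duration, imported from the same rate_limit module; its body is not part of this excerpt. A plausible minimal version — assuming it simply returns a uniformly random fraction of the given duration, which is all the call sites here need — is:

use std::time::Duration;
use rand::Rng;

// Sketch only: jitter helper so that many timelines restarted together do not
// all hit their first upload/eviction deadline at the same instant.
pub fn rand_duration(d: &Duration) -> Duration {
    let frac: f64 = rand::thread_rng().gen_range(0.0..=1.0);
    d.mul_f64(frac)
}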
@@ -67,6 +67,7 @@ FALLBACK_DURATION = {
     "test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
     "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
     "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
     "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
     "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
     "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
@@ -18,6 +18,7 @@ anyhow.workspace = true
 aws-config.workspace = true
 bytes.workspace = true
 camino.workspace = true
+chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
 futures.workspace = true
@@ -31,6 +32,7 @@ once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
+rand.workspace = true
 reqwest = { workspace = true, features = ["stream"] }
 routerify.workspace = true
 serde.workspace = true
@@ -44,7 +46,12 @@ scopeguard.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 
-diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
+diesel = { version = "2.1.4", features = [
+    "serde_json",
+    "postgres",
+    "r2d2",
+    "chrono",
+] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }
 
@@ -52,4 +59,3 @@ utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
 control_plane = { path = "../control_plane" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
-
@@ -0,0 +1 @@
+DROP TABLE metadata_health;
@@ -0,0 +1,14 @@
+CREATE TABLE metadata_health (
+    tenant_id VARCHAR NOT NULL,
+    shard_number INTEGER NOT NULL,
+    shard_count INTEGER NOT NULL,
+    PRIMARY KEY(tenant_id, shard_number, shard_count),
+    -- Rely on cascade behavior for delete
+    FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
+    healthy BOOLEAN NOT NULL DEFAULT TRUE,
+    last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+
+INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
+SELECT tenant_id, shard_number, shard_count FROM tenant_shards;
@@ -10,7 +10,11 @@ use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use metrics::{BuildInfo, NeonMetrics};
-use pageserver_api::controller_api::TenantCreateRequest;
+use pageserver_api::controller_api::{
+    MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
+    MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
+    TenantCreateRequest,
+};
 use pageserver_api::models::{
     TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
     TenantTimeTravelRequest, TimelineCreateRequest,
@@ -560,6 +564,51 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
     json_response(StatusCode::ACCEPTED, ())
 }
 
+async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Scrubber)?;
+
+    let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
+    let state = get_state(&req);
+
+    state.service.metadata_health_update(update_req).await?;
+
+    json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
+}
+
+async fn handle_metadata_health_list_unhealthy(
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
+
+    json_response(
+        StatusCode::OK,
+        MetadataHealthListUnhealthyResponse {
+            unhealthy_tenant_shards,
+        },
+    )
+}
+
+async fn handle_metadata_health_list_outdated(
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
+    let state = get_state(&req);
+    let health_records = state
+        .service
+        .metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
+        .await?;
+
+    json_response(
+        StatusCode::OK,
+        MetadataHealthListOutdatedResponse { health_records },
+    )
+}
+
 async fn handle_tenant_shard_split(
     service: Arc<Service>,
     mut req: Request<Body>,
@@ -987,6 +1036,28 @@ pub fn make_router(
                 RequestName("control_v1_cancel_node_fill"),
             )
         })
+        // Metadata health operations
+        .post("/control/v1/metadata_health/update", |r| {
+            named_request_span(
+                r,
+                handle_metadata_health_update,
+                RequestName("control_v1_metadata_health_update"),
+            )
+        })
+        .get("/control/v1/metadata_health/unhealthy", |r| {
+            named_request_span(
+                r,
+                handle_metadata_health_list_unhealthy,
+                RequestName("control_v1_metadata_health_list_unhealthy"),
+            )
+        })
+        .post("/control/v1/metadata_health/outdated", |r| {
+            named_request_span(
+                r,
+                handle_metadata_health_list_outdated,
+                RequestName("control_v1_metadata_health_list_outdated"),
+            )
+        })
         // TODO(vlad): endpoint for cancelling drain and fill
         // Tenant Shard operations
         .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
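For orientation, the routes registered above can be exercised directly over HTTP. The sketch below is hypothetical (the controller address and token are placeholders, and only the GET endpoint is shown), using the reqwest dependency the controller workspace already has; per the check_permissions calls, the update endpoint expects a scrubber-scoped token and the two list endpoints an admin-scoped one.

// Sketch only: query the new endpoint for tenant shards whose scrubber
// metadata is marked unhealthy. CONTROLLER_URL and ADMIN_JWT are placeholders.
const CONTROLLER_URL: &str = "http://127.0.0.1:1234";
const ADMIN_JWT: &str = "<admin-scoped token>";

async fn list_unhealthy(client: &reqwest::Client) -> anyhow::Result<String> {
    let resp = client
        .get(format!("{CONTROLLER_URL}/control/v1/metadata_health/unhealthy"))
        .bearer_auth(ADMIN_JWT)
        .send()
        .await?
        .error_for_status()?;
    Ok(resp.text().await?)
}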
@@ -9,12 +9,14 @@ use std::time::Duration;
 use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
+use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
     Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
     RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
 
@@ -86,6 +88,10 @@ struct Cli {
     // TODO: make `cfg(feature = "testing")`
     #[arg(long)]
     neon_local_repo_dir: Option<PathBuf>,
+
+    /// Chaos testing
+    #[arg(long)]
+    chaos_interval: Option<humantime::Duration>,
 }
 
 enum StrictMode {
@@ -309,6 +315,22 @@ async fn async_main() -> anyhow::Result<()> {
     tracing::info!("Serving on {0}", args.listen);
     let server_task = tokio::task::spawn(server);
 
+    let chaos_task = args.chaos_interval.map(|interval| {
+        let service = service.clone();
+        let cancel = CancellationToken::new();
+        let cancel_bg = cancel.clone();
+        (
+            tokio::task::spawn(
+                async move {
+                    let mut chaos_injector = ChaosInjector::new(service, interval.into());
+                    chaos_injector.run(cancel_bg).await
+                }
+                .instrument(tracing::info_span!("chaos_injector")),
+            ),
+            cancel,
+        )
+    });
+
     // Wait until we receive a signal
     let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
     let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
@@ -337,6 +359,12 @@ async fn async_main() -> anyhow::Result<()> {
         }
     }
 
+    // If we were injecting chaos, stop that so that we're not calling into Service while it shuts down
+    if let Some((chaos_jh, chaos_cancel)) = chaos_task {
+        chaos_cancel.cancel();
+        chaos_jh.await.ok();
+    }
+
     service.shutdown().await;
     tracing::info!("Service shutdown complete");
 
@@ -8,6 +8,7 @@ use self::split_state::SplitState;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
+use pageserver_api::controller_api::MetadataHealthRecord;
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
@@ -90,6 +91,10 @@ pub(crate) enum DatabaseOperation {
     UpdateTenantShard,
     DeleteTenant,
     UpdateTenantConfig,
+    UpdateMetadataHealth,
+    ListMetadataHealth,
+    ListMetadataHealthUnhealthy,
+    ListMetadataHealthOutdated,
 }
 
 #[must_use]
@@ -307,15 +312,32 @@ impl Persistence {
         &self,
         shards: Vec<TenantShardPersistence>,
     ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
+        use crate::schema::metadata_health;
+        use crate::schema::tenant_shards;
+
+        let now = chrono::Utc::now();
+
+        let metadata_health_records = shards
+            .iter()
+            .map(|t| MetadataHealthPersistence {
+                tenant_id: t.tenant_id.clone(),
+                shard_number: t.shard_number,
+                shard_count: t.shard_count,
+                healthy: true,
+                last_scrubbed_at: now,
+            })
+            .collect::<Vec<_>>();
+
         self.with_measured_conn(
             DatabaseOperation::InsertTenantShards,
             move |conn| -> DatabaseResult<()> {
-                for tenant in &shards {
-                    diesel::insert_into(tenant_shards)
-                        .values(tenant)
-                        .execute(conn)?;
-                }
+                diesel::insert_into(tenant_shards::table)
+                    .values(&shards)
+                    .execute(conn)?;
+
+                diesel::insert_into(metadata_health::table)
+                    .values(&metadata_health_records)
+                    .execute(conn)?;
                 Ok(())
             },
         )
@@ -329,10 +351,10 @@ impl Persistence {
         self.with_measured_conn(
             DatabaseOperation::DeleteTenant,
             move |conn| -> DatabaseResult<()> {
+                // `metadata_health` status (if exists) is also deleted based on the cascade behavior.
                 diesel::delete(tenant_shards)
                     .filter(tenant_id.eq(del_tenant_id.to_string()))
                     .execute(conn)?;
-
                 Ok(())
             },
         )
@@ -675,6 +697,94 @@ impl Persistence {
         )
         .await
     }
+
+    /// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
+    ///
+    /// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
+    #[allow(dead_code)]
+    pub(crate) async fn update_metadata_health_records(
+        &self,
+        healthy_records: Vec<MetadataHealthPersistence>,
+        unhealthy_records: Vec<MetadataHealthPersistence>,
+        now: chrono::DateTime<chrono::Utc>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::metadata_health::dsl::*;
+
+        self.with_measured_conn(
+            DatabaseOperation::UpdateMetadataHealth,
+            move |conn| -> DatabaseResult<_> {
+                diesel::insert_into(metadata_health)
+                    .values(&healthy_records)
+                    .on_conflict((tenant_id, shard_number, shard_count))
+                    .do_update()
+                    .set((healthy.eq(true), last_scrubbed_at.eq(now)))
+                    .execute(conn)?;
+
+                diesel::insert_into(metadata_health)
+                    .values(&unhealthy_records)
+                    .on_conflict((tenant_id, shard_number, shard_count))
+                    .do_update()
+                    .set((healthy.eq(false), last_scrubbed_at.eq(now)))
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await
+    }
+
+    /// Lists all the metadata health records.
+    #[allow(dead_code)]
+    pub(crate) async fn list_metadata_health_records(
+        &self,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealth,
+            move |conn| -> DatabaseResult<_> {
+                Ok(
+                    crate::schema::metadata_health::table
+                        .load::<MetadataHealthPersistence>(conn)?,
+                )
+            },
+        )
+        .await
+    }
+
+    /// Lists all the metadata health records that is unhealthy.
+    #[allow(dead_code)]
+    pub(crate) async fn list_unhealthy_metadata_health_records(
+        &self,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        use crate::schema::metadata_health::dsl::*;
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealthUnhealthy,
+            move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::metadata_health::table
+                    .filter(healthy.eq(false))
+                    .load::<MetadataHealthPersistence>(conn)?)
+            },
+        )
+        .await
+    }
+
+    /// Lists all the metadata health records that have not been updated since an `earlier` time.
+    #[allow(dead_code)]
+    pub(crate) async fn list_outdated_metadata_health_records(
+        &self,
+        earlier: chrono::DateTime<chrono::Utc>,
+    ) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
+        use crate::schema::metadata_health::dsl::*;
+
+        self.with_measured_conn(
+            DatabaseOperation::ListMetadataHealthOutdated,
+            move |conn| -> DatabaseResult<_> {
+                let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
+                let res = query.load::<MetadataHealthPersistence>(conn)?;
+
+                Ok(res)
+            },
+        )
+        .await
+    }
 }
 
 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -744,3 +854,59 @@ pub(crate) struct NodePersistence {
     pub(crate) listen_pg_addr: String,
     pub(crate) listen_pg_port: i32,
 }
+
+/// Tenant metadata health status that are stored durably.
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
+#[diesel(table_name = crate::schema::metadata_health)]
+pub(crate) struct MetadataHealthPersistence {
+    #[serde(default)]
+    pub(crate) tenant_id: String,
+    #[serde(default)]
+    pub(crate) shard_number: i32,
+    #[serde(default)]
+    pub(crate) shard_count: i32,
+
+    pub(crate) healthy: bool,
+    pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
+}
+
+impl MetadataHealthPersistence {
+    pub fn new(
+        tenant_shard_id: TenantShardId,
+        healthy: bool,
+        last_scrubbed_at: chrono::DateTime<chrono::Utc>,
+    ) -> Self {
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_number = tenant_shard_id.shard_number.0 as i32;
+        let shard_count = tenant_shard_id.shard_count.literal() as i32;
+
+        MetadataHealthPersistence {
+            tenant_id,
+            shard_number,
+            shard_count,
+            healthy,
+            last_scrubbed_at,
+        }
+    }
+
+    #[allow(dead_code)]
+    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
+        Ok(TenantShardId {
+            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
+            shard_number: ShardNumber(self.shard_number as u8),
+            shard_count: ShardCount::new(self.shard_count as u8),
+        })
+    }
+}
+
+impl From<MetadataHealthPersistence> for MetadataHealthRecord {
+    fn from(value: MetadataHealthPersistence) -> Self {
+        MetadataHealthRecord {
+            tenant_shard_id: value
+                .get_tenant_shard_id()
+                .expect("stored tenant id should be valid"),
+            healthy: value.healthy,
+            last_scrubbed_at: value.last_scrubbed_at,
+        }
+    }
+}
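As a quick usage sketch (not part of the patch): the durable row type above is built from a TenantShardId via MetadataHealthPersistence::new and converted back to the API-facing MetadataHealthRecord through the From impl, which is exactly the round-trip the service layer performs. The example assumes the storage controller crate context for the imported types.

// Sketch only: round-trip between the durable row and the API record.
fn to_api_record(shard: TenantShardId) -> MetadataHealthRecord {
    let row = MetadataHealthPersistence::new(shard, /* healthy */ true, chrono::Utc::now());
    MetadataHealthRecord::from(row)
}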
@@ -656,11 +656,8 @@ impl Reconciler {
         // reconcile this location. This includes locations with different configurations, as well
         // as locations with unknown (None) observed state.
 
-        // The general case is to increment the generation. However, there are cases
-        // where this is not necessary:
-        // - if we are only updating the TenantConf part of the location
-        // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
-        //   and the location was already in the correct generation
+        // Incrementing generation is the safe general case, but is inefficient for changes that only
+        // modify some details (e.g. the tenant's config).
         let increment_generation = match observed {
             None => true,
             Some(ObservedStateLocation { conf: None }) => true,
@@ -669,18 +666,11 @@ impl Reconciler {
             }) => {
                 let generations_match = observed.generation == wanted_conf.generation;
 
-                use LocationConfigMode::*;
-                let mode_transition_requires_gen_inc =
-                    match (observed.mode, wanted_conf.mode) {
-                        // Usually the short-lived attachment modes (multi and stale) are only used
-                        // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
-                        // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
-                        (AttachedSingle, AttachedStale) => false,
-                        (AttachedMulti, AttachedSingle) => false,
-                        (lhs, rhs) => lhs != rhs,
-                    };
-
-                !generations_match || mode_transition_requires_gen_inc
+                // We may skip incrementing the generation if the location is already in the expected mode and
+                // generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
+                // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
+                // after a restart/crash, so fall back to the universally safe path of incrementing generation.
+                !generations_match || (observed.mode != wanted_conf.mode)
             }
         };
 
@@ -1,5 +1,15 @@
 // @generated automatically by Diesel CLI.
 
+diesel::table! {
+    metadata_health (tenant_id, shard_number, shard_count) {
+        tenant_id -> Varchar,
+        shard_number -> Int4,
+        shard_count -> Int4,
+        healthy -> Bool,
+        last_scrubbed_at -> Timestamptz,
+    }
+}
+
 diesel::table! {
     nodes (node_id) {
         node_id -> Int8,
@@ -26,4 +36,4 @@ diesel::table! {
     }
 }
 
-diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
+diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
@@ -16,7 +16,7 @@ use crate::{
     compute_hook::NotifyError,
     id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
     metrics::LeadershipStatusGroup,
-    persistence::{AbortShardSplitStatus, TenantFilter},
+    persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
     reconciler::{ReconcileError, ReconcileUnits},
     scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
     tenant_shard::{
@@ -33,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
-        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
-        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
-        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
-        TenantShardMigrateResponse, UtilizationScore,
+        MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
+        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
+        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
+        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
+        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
     },
     models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
@@ -84,6 +84,8 @@ use crate::{
 };
 use serde::{Deserialize, Serialize};
 
+pub mod chaos_injector;
+
 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
 
@@ -6095,6 +6097,68 @@ impl Service {
         Ok(())
     }
 
+    /// Updates scrubber metadata health check results.
+    pub(crate) async fn metadata_health_update(
+        &self,
+        update_req: MetadataHealthUpdateRequest,
+    ) -> Result<(), ApiError> {
+        let now = chrono::offset::Utc::now();
+        let (healthy_records, unhealthy_records) = {
+            let locked = self.inner.read().unwrap();
+            let healthy_records = update_req
+                .healthy_tenant_shards
+                .into_iter()
+                // Retain only health records associated with tenant shards managed by storage controller.
+                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
+                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
+                .collect();
+            let unhealthy_records = update_req
+                .unhealthy_tenant_shards
+                .into_iter()
+                .filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
+                .map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
+                .collect();
+
+            (healthy_records, unhealthy_records)
+        };
+
+        self.persistence
+            .update_metadata_health_records(healthy_records, unhealthy_records, now)
+            .await?;
+        Ok(())
+    }
+
+    /// Lists the tenant shards that has unhealthy metadata status.
+    pub(crate) async fn metadata_health_list_unhealthy(
+        &self,
+    ) -> Result<Vec<TenantShardId>, ApiError> {
+        let result = self
+            .persistence
+            .list_unhealthy_metadata_health_records()
+            .await?
+            .iter()
+            .map(|p| p.get_tenant_shard_id().unwrap())
+            .collect();
+
+        Ok(result)
+    }
+
+    /// Lists the tenant shards that have not been scrubbed for some duration.
+    pub(crate) async fn metadata_health_list_outdated(
+        &self,
+        not_scrubbed_for: Duration,
+    ) -> Result<Vec<MetadataHealthRecord>, ApiError> {
+        let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
+        let result = self
+            .persistence
+            .list_outdated_metadata_health_records(earlier)
+            .await?
+            .into_iter()
+            .map(|record| record.into())
+            .collect();
+        Ok(result)
+    }
+
     pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
         self.inner.read().unwrap().get_leadership_status()
     }
storage_controller/src/service/chaos_injector.rs (new file, 71 lines)
@@ -0,0 +1,71 @@
+use std::{sync::Arc, time::Duration};
+
+use rand::seq::SliceRandom;
+use rand::thread_rng;
+use tokio_util::sync::CancellationToken;
+
+use super::Service;
+
+pub struct ChaosInjector {
+    service: Arc<Service>,
+    interval: Duration,
+}
+
+impl ChaosInjector {
+    pub fn new(service: Arc<Service>, interval: Duration) -> Self {
+        Self { service, interval }
+    }
+
+    pub async fn run(&mut self, cancel: CancellationToken) {
+        let mut interval = tokio::time::interval(self.interval);
+
+        loop {
+            tokio::select! {
+                _ = interval.tick() => {}
+                _ = cancel.cancelled() => {
+                    tracing::info!("Shutting down");
+                    return;
+                }
+            }
+
+            self.inject_chaos().await;
+
+            tracing::info!("Chaos iteration...");
+        }
+    }
+
+    async fn inject_chaos(&mut self) {
+        // Pick some shards to interfere with
+        let batch_size = 128;
+        let mut inner = self.service.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = inner.parts_mut();
+        let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
+        let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
+
+        for victim in victims {
+            let shard = tenants
+                .get_mut(victim)
+                .expect("Held lock between choosing ID and this get");
+
+            // Pick a secondary to promote
+            let Some(new_location) = shard
+                .intent
+                .get_secondary()
+                .choose(&mut thread_rng())
+                .cloned()
+            else {
+                tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
+                continue;
+            };
+
+            let Some(old_location) = *shard.intent.get_attached() else {
+                tracing::info!("Skipping shard {victim}: currently has no attached location");
+                continue;
+            };
+
+            shard.intent.demote_attached(scheduler, old_location);
+            shard.intent.promote_attached(scheduler, new_location);
+            self.service.maybe_reconcile_shard(shard, nodes);
+        }
+    }
+}
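The injector above is only started when the controller binary is launched with the new --chaos-interval flag (see the main.rs hunk earlier), and the shutdown path cancels it before Service::shutdown. As a small, hypothetical sketch of the same contract — running it for a bounded time and stopping it via the CancellationToken; the service handle and durations are placeholders:

use std::{sync::Arc, time::Duration};
use tokio_util::sync::CancellationToken;

// Sketch only: run chaos iterations every `period` until `total` has elapsed.
async fn run_chaos_for(service: Arc<Service>, period: Duration, total: Duration) {
    let cancel = CancellationToken::new();
    let mut injector = ChaosInjector::new(service, period);

    let canceller = cancel.clone();
    let stop = tokio::spawn(async move {
        tokio::time::sleep(total).await;
        canceller.cancel();
    });

    injector.run(cancel).await; // returns once the token is cancelled
    stop.await.ok();
}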
@@ -10,6 +10,7 @@ aws-smithy-async.workspace = true
 either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
+git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
@@ -40,6 +40,11 @@ impl TimelineAnalysis {
             garbage_keys: Vec::new(),
         }
     }
+
+    /// Whether a timeline is healthy.
+    pub(crate) fn is_healthy(&self) -> bool {
+        self.errors.is_empty() && self.warnings.is_empty()
+    }
 }
 
 pub(crate) async fn branch_cleanup_and_check_errors(
@@ -1,10 +1,13 @@
+use std::pin::pin;
+
 use futures::{StreamExt, TryStreamExt};
 use pageserver::tenant::storage_layer::LayerName;
+use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};
 
 use crate::{
-    checks::parse_layer_object_name, init_remote, list_objects_with_retries,
-    metadata_stream::stream_tenants, BucketConfig, NodeKind,
+    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
+    stream_objects_with_retries, BucketConfig, NodeKind,
 };
 
 #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -47,45 +50,38 @@ pub async fn find_large_objects(
     ignore_deltas: bool,
     concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));
+    let (remote_client, target) =
+        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = pin!(stream_tenants_generic(&remote_client, &target));
 
     let objects_stream = tenants.map_ok(|tenant_shard_id| {
         let mut tenant_root = target.tenant_root(&tenant_shard_id);
-        let s3_client = s3_client.clone();
+        let remote_client = remote_client.clone();
         async move {
             let mut objects = Vec::new();
             let mut total_objects_ctr = 0u64;
             // We want the objects and not just common prefixes
             tenant_root.delimiter.clear();
-            let mut continuation_token = None;
-            loop {
-                let fetch_response =
-                    list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
-                        .await?;
-                for obj in fetch_response.contents().iter().filter(|o| {
-                    if let Some(obj_size) = o.size {
-                        min_size as i64 <= obj_size
-                    } else {
-                        false
-                    }
-                }) {
-                    let key = obj.key().expect("couldn't get key").to_owned();
+            let mut objects_stream = pin!(stream_objects_with_retries(
+                &remote_client,
+                ListingMode::NoDelimiter,
+                &tenant_root
+            ));
+            while let Some(listing) = objects_stream.next().await {
+                let listing = listing?;
+                for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
+                    let key = obj.key.to_string();
                     let kind = LargeObjectKind::from_key(&key);
                     if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
                         continue;
                     }
                     objects.push(LargeObject {
                         key,
-                        size: obj.size.unwrap() as u64,
+                        size: obj.size,
                         kind,
                     })
                 }
-                total_objects_ctr += fetch_response.contents().len() as u64;
-                match fetch_response.next_continuation_token {
-                    Some(new_token) => continuation_token = Some(new_token),
-                    None => break,
-                }
+                total_objects_ctr += listing.keys.len() as u64;
             }
 
             Ok((tenant_shard_id, objects, total_objects_ctr))
@@ -5,6 +5,7 @@
 use std::{
     collections::{HashMap, HashSet},
     sync::Arc,
+    time::Duration,
 };
 
 use anyhow::Context;
@@ -18,8 +19,8 @@ use utils::id::TenantId;
 
 use crate::{
     cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote, init_remote_generic,
-    metadata_stream::{stream_tenant_timelines, stream_tenants},
+    init_remote_generic, list_objects_with_retries_generic,
+    metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
     BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
 };
 
@@ -27,6 +28,11 @@ use crate::{
 enum GarbageReason {
     DeletedInConsole,
     MissingInConsole,
+
+    // The remaining data relates to a known deletion issue, and we're sure that purging this
+    // will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
+    // there is nothing in a tenant path apart from a heatmap file.
+    KnownBug,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -72,6 +78,15 @@ impl GarbageList {
         }
     }
 
+    /// If an entity has been identified as requiring purge due to a known bug, e.g.
+    /// a particular type of object left behind after an incomplete deletion.
+    fn append_buggy(&mut self, entity: GarbageEntity) {
+        self.items.push(GarbageItem {
+            entity,
+            reason: GarbageReason::KnownBug,
+        });
+    }
+
     /// Return true if appended, false if not. False means the result was not garbage.
     fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
     where
@@ -138,7 +153,7 @@ async fn find_garbage_inner(
     node_kind: NodeKind,
 ) -> anyhow::Result<GarbageList> {
     // Construct clients for S3 and for Console API
-    let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
+    let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
     let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
 
     // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -164,7 +179,7 @@ async fn find_garbage_inner(
 
     // Enumerate Tenants in S3, and check if each one exists in Console
     tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
-    let tenants = stream_tenants(&s3_client, &target);
+    let tenants = stream_tenants_generic(&remote_client, &target);
     let tenants_checked = tenants.map_ok(|t| {
         let api_client = cloud_admin_api_client.clone();
         let console_cache = console_cache.clone();
@@ -219,6 +234,66 @@ async fn find_garbage_inner(
             assert!(project.tenant == tenant_shard_id.tenant_id);
         }
 
+        // Special case: If it's missing in console, check for known bugs that would enable us to conclusively
+        // identify it as purge-able anyway
+        if console_result.is_none() {
+            let timelines =
+                stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
+                    .await?
+                    .collect::<Vec<_>>()
+                    .await;
+            if timelines.is_empty() {
+                // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
+                let tenant_objects = list_objects_with_retries_generic(
+                    &remote_client,
+                    ListingMode::WithDelimiter,
+                    &target.tenant_root(&tenant_shard_id),
+                )
+                .await?;
+                let object = tenant_objects.keys.first().unwrap();
+                if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
+                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
+                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
+                    continue;
+                } else {
+                    tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
+                }
+            } else {
+                // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
+                // rollout of WAL DR in which we never deleted these.
+                let mut any_non_initdb = false;
+
+                for timeline_r in timelines {
+                    let timeline = timeline_r?;
+                    let timeline_objects = list_objects_with_retries_generic(
+                        &remote_client,
+                        ListingMode::WithDelimiter,
+                        &target.timeline_root(&timeline),
+                    )
+                    .await?;
+                    if !timeline_objects.prefixes.is_empty() {
+                        // Sub-paths? Unexpected
+                        any_non_initdb = true;
+                    } else {
+                        let object = timeline_objects.keys.first().unwrap();
+                        if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
+                            tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
+                        } else {
+                            any_non_initdb = true;
+                        }
+                    }
+                }
+
+                if any_non_initdb {
+                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
+                } else {
+                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
+                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
+                    continue;
+                }
+            }
+        }
+
         if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
             tracing::debug!("Tenant {tenant_shard_id} is garbage");
         } else {
@@ -256,7 +331,8 @@ async fn find_garbage_inner(
 
     // Construct a stream of all timelines within active tenants
     let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
-    let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
+    let timelines =
+        active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
     let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
     let timelines = timelines.try_flatten();
 
@@ -349,9 +425,6 @@ pub async fn get_timeline_objects(
     tracing::debug!("Listing objects in timeline {ttid}");
     let timeline_root = super::remote_timeline_path_id(&ttid);
 
-    // TODO: apply extra validation based on object modification time. Don't purge
-    // timelines whose index_part.json has been touched recently.
-
     let list = s3_client
         .list(
             Some(&timeline_root),
@@ -422,6 +495,7 @@ impl DeletionProgressTracker {
 pub async fn purge_garbage(
     input_path: String,
     mode: PurgeMode,
+    min_age: Duration,
     dry_run: bool,
 ) -> anyhow::Result<()> {
     let list_bytes = tokio::fs::read(&input_path).await?;
@@ -432,7 +506,7 @@ pub async fn purge_garbage(
         input_path
     );
 
-    let remote_client =
+    let (remote_client, _target) =
         init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
 
     assert_eq!(
@@ -459,6 +533,7 @@ pub async fn purge_garbage(
         .filter(|i| match (&mode, &i.reason) {
             (PurgeMode::DeletedAndMissing, _) => true,
             (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
+            (PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
            (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
         });
 
@@ -487,6 +562,37 @@ pub async fn purge_garbage(
     let mut progress_tracker = DeletionProgressTracker::default();
     while let Some(result) = get_objects_results.next().await {
         let mut object_list = result?;
+
+        // Extra safety check: even if a collection of objects is garbage, check max() of modification
+        // times before purging, so that if we incorrectly marked a live tenant as garbage then we would
+        // notice that its index has been written recently and would omit deleting it.
+        if object_list.is_empty() {
+            // Simplify subsequent code by ensuring list always has at least one item
+            // Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
+            continue;
+        }
+        let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
+        let age = max_mtime.elapsed();
+        match age {
+            Err(_) => {
+                tracing::warn!("Bad last_modified time");
+                continue;
+            }
+            Ok(a) if a < min_age => {
+                // Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently
+                // written, but out of an abundance of caution we still don't purge it.
+                tracing::info!(
+                    "Skipping tenant with young objects {}..{}",
+                    object_list.first().as_ref().unwrap().key,
+                    object_list.last().as_ref().unwrap().key
+                );
+                continue;
+            }
+            Ok(_) => {
+                // Passed age check
+            }
+        }
+
         objects_to_delete.append(&mut object_list);
         if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
             do_delete(
@@ -16,22 +16,26 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, Context};
+use aws_config::retry::{RetryConfigBuilder, RetryMode};
 use aws_sdk_s3::config::Region;
 use aws_sdk_s3::error::DisplayErrorContext;
 use aws_sdk_s3::Client;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
+use futures::{Stream, StreamExt};
 use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{
-    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
-    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
 };
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use storage_controller_client::control_api;
 use tokio::io::AsyncReadExt;
+use tokio_util::sync::CancellationToken;
 use tracing::error;
 use tracing_appender::non_blocking::WorkerGuard;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -253,6 +257,12 @@ pub struct ControllerClientConfig {
     pub controller_jwt: String,
 }

+impl ControllerClientConfig {
+    pub fn build_client(self) -> control_api::Client {
+        control_api::Client::new(self.controller_api, Some(self.controller_jwt))
+    }
+}
+
 pub struct ConsoleConfig {
     pub token: String,
     pub base_url: Url,
@@ -305,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
 }

 async fn init_s3_client(bucket_region: Region) -> Client {
+    let mut retry_config_builder = RetryConfigBuilder::new();
+
+    retry_config_builder
+        .set_max_attempts(Some(3))
+        .set_mode(Some(RetryMode::Adaptive));
+
     let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
         .region(bucket_region)
+        .retry_config(retry_config_builder.build())
         .load()
         .await;
     Client::new(&config)
@@ -319,27 +336,35 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
     }
 }

+fn make_root_target(
+    bucket_name: String,
+    prefix_in_bucket: String,
+    node_kind: NodeKind,
+) -> RootTarget {
+    let s3_target = S3Target {
+        bucket_name,
+        prefix_in_bucket,
+        delimiter: "/".to_string(),
+    };
+    match node_kind {
+        NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
+        NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
+    }
+}
+
 async fn init_remote(
     bucket_config: BucketConfig,
     node_kind: NodeKind,
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
     let bucket_region = Region::new(bucket_config.region);
-    let delimiter = "/".to_string();
     let s3_client = Arc::new(init_s3_client(bucket_region).await);
     let default_prefix = default_prefix_in_bucket(node_kind).to_string();

-    let s3_root = match node_kind {
-        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
-            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
-            delimiter,
-        }),
-        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
-            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
-            delimiter,
-        }),
-    };
+    let s3_root = make_root_target(
+        bucket_config.bucket,
+        bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
+        node_kind,
+    );

     Ok((s3_client, s3_root))
 }
@@ -347,12 +372,12 @@ async fn init_remote(
 async fn init_remote_generic(
     bucket_config: BucketConfig,
     node_kind: NodeKind,
-) -> anyhow::Result<GenericRemoteStorage> {
+) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
     let endpoint = env::var("AWS_ENDPOINT_URL").ok();
     let default_prefix = default_prefix_in_bucket(node_kind).to_string();
     let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
     let storage = S3Config {
-        bucket_name: bucket_config.bucket,
+        bucket_name: bucket_config.bucket.clone(),
         bucket_region: bucket_config.region,
         prefix_in_bucket,
         endpoint,
@@ -366,7 +391,13 @@ async fn init_remote_generic(
         storage: RemoteStorageKind::AwsS3(storage),
         timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
     };
-    GenericRemoteStorage::from_config(&storage_config).await
+
+    // We already pass the prefix to the remote client above
+    let prefix_in_root_target = String::new();
+    let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
+
+    let client = GenericRemoteStorage::from_config(&storage_config).await?;
+    Ok((client, s3_root))
 }

 async fn list_objects_with_retries(
@@ -404,6 +435,84 @@ async fn list_objects_with_retries(
     Err(anyhow!("unreachable unless MAX_RETRIES==0"))
 }

+/// Listing possibly large amounts of keys in a streaming fashion.
+fn stream_objects_with_retries<'a>(
+    storage_client: &'a GenericRemoteStorage,
+    listing_mode: ListingMode,
+    s3_target: &'a S3Target,
+) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
+    async_stream::stream! {
+        let mut trial = 0;
+        let cancel = CancellationToken::new();
+        let prefix_str = &s3_target
+            .prefix_in_bucket
+            .strip_prefix("/")
+            .unwrap_or(&s3_target.prefix_in_bucket);
+        let prefix = RemotePath::from_string(prefix_str)?;
+        let mut list_stream =
+            storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
+        while let Some(res) = list_stream.next().await {
+            if let Err(err) = res {
+                let yield_err = if err.is_permanent() {
+                    true
+                } else {
+                    let backoff_time = 1 << trial.max(5);
+                    tokio::time::sleep(Duration::from_secs(backoff_time)).await;
+                    trial += 1;
+                    trial == MAX_RETRIES - 1
+                };
+                if yield_err {
+                    yield Err(err)
+                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
+                    break;
+                }
+            } else {
+                trial = 0;
+                yield res.map_err(anyhow::Error::from);
+            }
+        }
+    }
+}
+
+/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
+/// use [`stream_objects_with_retries`] instead.
+async fn list_objects_with_retries_generic(
+    remote_client: &GenericRemoteStorage,
+    listing_mode: ListingMode,
+    s3_target: &S3Target,
+) -> anyhow::Result<Listing> {
+    let cancel = CancellationToken::new();
+    let prefix_str = &s3_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&s3_target.prefix_in_bucket);
+    let prefix = RemotePath::from_string(prefix_str)?;
+    for trial in 0..MAX_RETRIES {
+        match remote_client
+            .list(Some(&prefix), listing_mode, None, &cancel)
+            .await
+        {
+            Ok(response) => return Ok(response),
+            Err(e) => {
+                if trial == MAX_RETRIES - 1 {
+                    return Err(e)
+                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
+                }
+                error!(
+                    "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
+                    s3_target.bucket_name,
+                    s3_target.prefix_in_bucket,
+                    s3_target.delimiter,
+                    DisplayErrorContext(e),
+                );
+                let backoff_time = 1 << trial.max(5);
+                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
+            }
+        }
+    }
+    panic!("MAX_RETRIES is not allowed to be 0");
+}
+
 async fn download_object_with_retries(
     s3_client: &Client,
     bucket_name: &str,
@@ -1,7 +1,8 @@
 use anyhow::{anyhow, bail};
 use camino::Utf8PathBuf;
+use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
-use reqwest::Url;
+use reqwest::{Method, Url};
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
 use storage_scrubber::scan_pageserver_metadata::scan_metadata;
@@ -16,6 +17,11 @@ use storage_scrubber::{
 use clap::{Parser, Subcommand};
 use utils::id::TenantId;

+use utils::{project_build_tag, project_git_version};
+
+project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);
+
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -50,6 +56,8 @@ enum Command {
         input_path: String,
         #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
         mode: PurgeMode,
+        #[arg(long = "min-age")]
+        min_age: humantime::Duration,
     },
     #[command(verbatim_doc_comment)]
     ScanMetadata {
@@ -59,6 +67,8 @@ enum Command {
         json: bool,
         #[arg(long = "tenant-id", num_args = 0..)]
         tenant_ids: Vec<TenantShardId>,
+        #[arg(long = "post", default_value_t = false)]
+        post_to_storage_controller: bool,
         #[arg(long, default_value = None)]
         /// For safekeeper node_kind only, points to db with debug dump
        dump_db_connstr: Option<String>,
@@ -96,6 +106,8 @@ enum Command {
 async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();

+    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
+
     let bucket_config = BucketConfig::from_env()?;

     let command_log_name = match &cli.command {
@@ -114,11 +126,20 @@ async fn main() -> anyhow::Result<()> {
         chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
     ));

+    let controller_client_conf = cli.controller_api.map(|controller_api| {
+        ControllerClientConfig {
+            controller_api,
+            // Default to no key: this is a convenience when working in a development environment
+            controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
+        }
+    });
+
     match cli.command {
         Command::ScanMetadata {
             json,
             tenant_ids,
             node_kind,
+            post_to_storage_controller,
             dump_db_connstr,
             dump_db_table,
         } => {
@@ -157,6 +178,9 @@ async fn main() -> anyhow::Result<()> {
             }
             Ok(())
         } else {
+            if controller_client_conf.is_none() && post_to_storage_controller {
+                return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
+            }
             match scan_metadata(bucket_config.clone(), tenant_ids).await {
                 Err(e) => {
                     tracing::error!("Failed: {e}");
@@ -168,6 +192,21 @@ async fn main() -> anyhow::Result<()> {
                     } else {
                         println!("{}", summary.summary_string());
                     }
+
+                    if post_to_storage_controller {
+                        if let Some(conf) = controller_client_conf {
+                            let controller_client = conf.build_client();
+                            let body = summary.build_health_update_request();
+                            controller_client
+                                .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
+                                    Method::POST,
+                                    "control/v1/metadata_health/update".to_string(),
+                                    Some(body),
+                                )
+                                .await?;
+                        }
+                    }
+
                     if summary.is_fatal() {
                         Err(anyhow::anyhow!("Fatal scrub errors detected"))
                     } else if summary.is_empty() {
@@ -196,9 +235,11 @@ async fn main() -> anyhow::Result<()> {
             let console_config = ConsoleConfig::from_env()?;
             find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
         }
-        Command::PurgeGarbage { input_path, mode } => {
-            purge_garbage(input_path, mode, !cli.delete).await
-        }
+        Command::PurgeGarbage {
+            input_path,
+            mode,
+            min_age,
+        } => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await,
         Command::TenantSnapshot {
             tenant_id,
             output_path,
@@ -213,14 +254,6 @@ async fn main() -> anyhow::Result<()> {
             min_age,
             mode,
         } => {
-            let controller_client_conf = cli.controller_api.map(|controller_api| {
-                ControllerClientConfig {
-                    controller_api,
-                    // Default to no key: this is a convenience when working in a development environment
-                    controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
-                }
-            });
-
             match (&controller_client_conf, mode) {
                 (Some(_), _) => {
                     // Any mode may run when controller API is set
@@ -1,12 +1,41 @@
-use anyhow::Context;
+use std::str::FromStr;
+
+use anyhow::{anyhow, Context};
 use async_stream::{stream, try_stream};
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
+use futures::StreamExt;
+use remote_storage::{GenericRemoteStorage, ListingMode};
 use tokio_stream::Stream;

-use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
+use crate::{
+    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
+    TenantShardTimelineId,
+};
 use pageserver_api::shard::TenantShardId;
 use utils::id::{TenantId, TimelineId};

+/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
+pub fn stream_tenants_generic<'a>(
+    remote_client: &'a GenericRemoteStorage,
+    target: &'a RootTarget,
+) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
+    try_stream! {
+        let tenants_target = target.tenants_root();
+        let mut tenants_stream =
+            std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
+        while let Some(chunk) = tenants_stream.next().await {
+            let chunk = chunk?;
+            let entry_ids = chunk.prefixes.iter()
+                .map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'")));
+            for dir_name_res in entry_ids {
+                let dir_name = dir_name_res?;
+                let id = TenantShardId::from_str(dir_name)?;
+                yield id;
+            }
+        }
+    }
+}
+
 /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
 pub fn stream_tenants<'a>(
     s3_client: &'a Client,
@@ -160,6 +189,63 @@ pub async fn stream_tenant_timelines<'a>(
     })
 }

+/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
+/// using a listing. The listing is done before the stream is built, so that this
+/// function can be used to generate concurrency on a stream using buffer_unordered.
+pub async fn stream_tenant_timelines_generic<'a>(
+    remote_client: &'a GenericRemoteStorage,
+    target: &'a RootTarget,
+    tenant: TenantShardId,
+) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
+    let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
+    let timelines_target = target.timelines_root(&tenant);
+
+    let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
+        remote_client,
+        ListingMode::WithDelimiter,
+        &timelines_target
+    ));
+    loop {
+        tracing::debug!("Listing in {tenant}");
+        let fetch_response = match objects_stream.next().await {
+            None => break,
+            Some(Err(e)) => {
+                timeline_ids.push(Err(e));
+                break;
+            }
+            Some(Ok(r)) => r,
+        };
+
+        let new_entry_ids = fetch_response
+            .prefixes
+            .iter()
+            .filter_map(|prefix| -> Option<&str> {
+                prefix
+                    .get_path()
+                    .as_str()
+                    .strip_prefix(&timelines_target.prefix_in_bucket)?
+                    .strip_suffix('/')
+            })
+            .map(|entry_id_str| {
+                entry_id_str
+                    .parse::<TimelineId>()
+                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
+            });
+
+        for i in new_entry_ids {
+            timeline_ids.push(i);
+        }
+    }
+
+    tracing::debug!("Yielding for {}", tenant);
+    Ok(stream! {
+        for i in timeline_ids {
+            let id = i?;
+            yield Ok(TenantShardTimelineId::new(tenant, id));
+        }
+    })
+}
+
 pub(crate) fn stream_listing<'a>(
     s3_client: &'a Client,
     target: &'a S3Target,
@@ -567,13 +567,7 @@ pub async fn pageserver_physical_gc(
     }

     // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
-    let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
-        let ControllerClientConfig {
-            controller_api,
-            controller_jwt,
-        } = c;
-        control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
-    }) else {
+    let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
         tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
         return Ok(summary);
     };
@@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
+use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use utils::id::TenantId;
 use utils::shard::ShardCount;

-#[derive(Serialize)]
+#[derive(Serialize, Default)]
 pub struct MetadataSummary {
     tenant_count: usize,
     timeline_count: usize,
@@ -23,19 +24,16 @@ pub struct MetadataSummary {
     with_warnings: HashSet<TenantShardTimelineId>,
     with_orphans: HashSet<TenantShardTimelineId>,
     indices_by_version: HashMap<usize, usize>,
+
+    #[serde(skip)]
+    pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
+    #[serde(skip)]
+    pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
 }

 impl MetadataSummary {
     fn new() -> Self {
-        Self {
-            tenant_count: 0,
-            timeline_count: 0,
-            timeline_shard_count: 0,
-            with_errors: HashSet::new(),
-            with_warnings: HashSet::new(),
-            with_orphans: HashSet::new(),
-            indices_by_version: HashMap::new(),
-        }
+        Self::default()
     }

     fn update_data(&mut self, data: &S3TimelineBlobData) {
@@ -54,6 +52,13 @@ impl MetadataSummary {
     }

     fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
+        if analysis.is_healthy() {
+            self.healthy_tenant_shards.insert(id.tenant_shard_id);
+        } else {
+            self.healthy_tenant_shards.remove(&id.tenant_shard_id);
+            self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
+        }
+
         if !analysis.errors.is_empty() {
             self.with_errors.insert(*id);
         }
@@ -101,6 +106,13 @@ Index versions: {version_summary}
     pub fn is_empty(&self) -> bool {
         self.timeline_shard_count == 0
     }
+
+    pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
+        MetadataHealthUpdateRequest {
+            healthy_tenant_shards: self.healthy_tenant_shards.clone(),
+            unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
+        }
+    }
 }

 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
@@ -150,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_pitr_history_size",
     "pageserver_layer_bytes",
     "pageserver_layer_count",
+    "pageserver_visible_physical_size",
     "pageserver_storage_operations_seconds_count_total",
     "pageserver_storage_operations_seconds_sum_total",
     "pageserver_evictions_total",
@@ -449,6 +449,7 @@ class TokenScope(str, Enum):
     GENERATIONS_API = "generations_api"
     SAFEKEEPER_DATA = "safekeeperdata"
     TENANT = "tenant"
+    SCRUBBER = "scrubber"


 class NeonEnvBuilder:
@@ -1942,11 +1943,15 @@ class NeonCli(AbstractNeonCli):
         remote_ext_config: Optional[str] = None,
         pageserver_id: Optional[int] = None,
         allow_multiple=False,
+        basebackup_request_tries: Optional[int] = None,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
             "endpoint",
             "start",
         ]
+        extra_env_vars = {}
+        if basebackup_request_tries is not None:
+            extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries)
         if remote_ext_config is not None:
             args.extend(["--remote-ext-config", remote_ext_config])

@@ -1959,7 +1964,7 @@ class NeonCli(AbstractNeonCli):
         if allow_multiple:
             args.extend(["--allow-multiple"])

-        res = self.raw_cli(args)
+        res = self.raw_cli(args, extra_env_vars)
         res.check_returncode()
         return res

@@ -2586,6 +2591,51 @@ class NeonStorageController(MetricsGetter, LogUtils):

             time.sleep(backoff)

+    def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]):
+        body: Dict[str, Any] = {
+            "healthy_tenant_shards": [str(t) for t in healthy],
+            "unhealthy_tenant_shards": [str(t) for t in unhealthy],
+        }
+
+        self.request(
+            "POST",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
+            json=body,
+            headers=self.headers(TokenScope.SCRUBBER),
+        )
+
+    def metadata_health_list_unhealthy(self):
+        response = self.request(
+            "GET",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def metadata_health_list_outdated(self, duration: str):
+        body: Dict[str, Any] = {"not_scrubbed_for": duration}
+
+        response = self.request(
+            "POST",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
+            json=body,
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool:
+        """Metadata is healthy if there is no unhealthy or outdated health records."""
+
+        unhealthy = self.metadata_health_list_unhealthy()
+        outdated = self.metadata_health_list_outdated(outdated_duration)
+
+        healthy = (
+            len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0
+        )
+        if not healthy:
+            log.info(f"{unhealthy=}, {outdated=}")
+        return healthy
+
     def step_down(self):
         log.info("Asking storage controller to step down")
         response = self.request(
@@ -3766,6 +3816,7 @@ class Endpoint(PgProtocol, LogUtils):
         pageserver_id: Optional[int] = None,
         safekeepers: Optional[List[int]] = None,
         allow_multiple: bool = False,
+        basebackup_request_tries: Optional[int] = None,
     ) -> "Endpoint":
         """
         Start the Postgres instance.
@@ -3787,6 +3838,7 @@ class Endpoint(PgProtocol, LogUtils):
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
+            basebackup_request_tries=basebackup_request_tries,
         )
         self._running.release(1)

@@ -3933,6 +3985,7 @@ class Endpoint(PgProtocol, LogUtils):
         remote_ext_config: Optional[str] = None,
         pageserver_id: Optional[int] = None,
         allow_multiple=False,
+        basebackup_request_tries: Optional[int] = None,
     ) -> "Endpoint":
         """
         Create an endpoint, apply config, and start Postgres.
@@ -3953,6 +4006,7 @@ class Endpoint(PgProtocol, LogUtils):
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
+            basebackup_request_tries=basebackup_request_tries,
        )

         log.info(f"Postgres startup took {time.time() - started_at} seconds")
@@ -3996,6 +4050,7 @@ class EndpointFactory:
         config_lines: Optional[List[str]] = None,
         remote_ext_config: Optional[str] = None,
         pageserver_id: Optional[int] = None,
+        basebackup_request_tries: Optional[int] = None,
     ) -> Endpoint:
         ep = Endpoint(
             self.env,
@@ -4014,6 +4069,7 @@ class EndpointFactory:
             lsn=lsn,
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
+            basebackup_request_tries=basebackup_request_tries,
         )

     def create(
@@ -4355,10 +4411,11 @@ class StorageScrubber:
         assert stdout is not None
         return stdout

-    def scan_metadata(self) -> Any:
-        stdout = self.scrubber_cli(
-            ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
-        )
+    def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
+        args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
+        if post_to_storage_controller:
+            args.append("--post")
+        stdout = self.scrubber_cli(args, timeout=30)

         try:
             return json.loads(stdout)
@@ -4482,6 +4539,13 @@ def test_output_dir(

     yield test_dir

+    # Allure artifacts creation might involve the creation of `.tar.zst` archives,
+    # which aren't going to be used if Allure results collection is not enabled
+    # (i.e. --alluredir is not set).
+    # Skip `allure_attach_from_dir` in this case
+    if not request.config.getoption("--alluredir"):
+        return
+
     preserve_database_files = False
     for k, v in request.node.user_properties:
         # NB: the neon_env_builder fixture uses this fixture (test_output_dir).
@@ -663,6 +663,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         force_image_layer_creation=False,
         wait_until_uploaded=False,
         compact: Optional[bool] = None,
+        **kwargs,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -680,6 +681,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
             params=query,
+            **kwargs,
         )
         log.info(f"Got checkpoint request response code: {res.status_code}")
         self.verbose_error(res)
test_runner/logical_repl/test_log_repl.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+"""
+Test the logical replication in Neon with the different consumers
+"""
+
+import hashlib
+import time
+
+import clickhouse_connect
+import psycopg2
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import RemotePostgres
+from fixtures.utils import wait_until
+
+
+def query_clickhouse(
+    client,
+    query: str,
+    digest: str,
+) -> None:
+    """
+    Run the query on the client
+    return answer if successful, raise an exception otherwise
+    """
+    log.debug("Query: %s", query)
+    res = client.query(query)
+    log.debug(res.result_rows)
+    m = hashlib.sha1()
+    m.update(repr(tuple(res.result_rows)).encode())
+    hash_res = m.hexdigest()
+    log.debug("Hash: %s", hash_res)
+    if hash_res == digest:
+        return
+    raise ValueError("Hash mismatch")
+
+
+@pytest.mark.remote_cluster
+def test_clickhouse(remote_pg: RemotePostgres):
+    """
+    Test the logical replication having ClickHouse as a client
+    """
+    conn_options = remote_pg.conn_options()
+    for _ in range(5):
+        try:
+            conn = psycopg2.connect(remote_pg.connstr())
+        except psycopg2.OperationalError as perr:
+            log.debug(perr)
+            time.sleep(1)
+        else:
+            break
+        raise TimeoutError
+    cur = conn.cursor()
+    cur.execute("DROP TABLE IF EXISTS table1")
+    cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
+    cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
+    conn.commit()
+    client = clickhouse_connect.get_client(host="clickhouse")
+    client.command("SET allow_experimental_database_materialized_postgresql=1")
+    client.command(
+        "CREATE DATABASE db1_postgres ENGINE = "
+        f"MaterializedPostgreSQL('{conn_options['host']}', "
+        f"'{conn_options['dbname']}', "
+        f"'{conn_options['user']}', '{conn_options['password']}') "
+        "SETTINGS materialized_postgresql_tables_list = 'table1';"
+    )
+    wait_until(
+        120,
+        0.5,
+        lambda: query_clickhouse(
+            client,
+            "select * from db1_postgres.table1 order by 1",
+            "ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
+        ),
+    )
+    cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
+    conn.commit()
+    wait_until(
+        120,
+        0.5,
+        lambda: query_clickhouse(
+            client,
+            "select * from db1_postgres.table1 order by 1",
+            "9eba2daaf7e4d7d27ac849525f68b562ab53947d",
+        ),
+    )
+    log.debug("Sleeping before final checking if Neon is still alive")
+    time.sleep(3)
+    cur.execute("SELECT 1")
@@ -6,21 +6,8 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder


-@pytest.mark.timeout(10000)
-def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
-    """
-    Test that GC is able to collect all old layers even if them are forming
-    "stairs" and there are not three delta layers since last image layer.
-
-    Information about image layers needed to collect old layers should
-    be propagated by GC to compaction task which should take in in account
-    when make a decision which new image layers needs to be created.
-
-    NB: this test demonstrates the problem. The source tree contained the
-    `gc_feedback` mechanism for about 9 months, but, there were problems
-    with it and it wasn't enabled at runtime.
-    This PR removed the code: https://github.com/neondatabase/neon/pull/6863
-    """
+def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str):
+    assert mode == "normal" or mode == "with_snapshots"
     env = neon_env_builder.init_start()
     client = env.pageserver.http_client()

@@ -74,6 +61,9 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma

         physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
         log.info(f"Physical storage size {physical_size}")
+        if mode == "with_snapshots":
+            if step == n_steps / 2:
+                env.neon_cli.create_branch("child")

     max_num_of_deltas_above_image = 0
     max_total_num_of_deltas = 0
@@ -149,3 +139,37 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
         log.info(f"Writing layer map to {layer_map_path}")
         with layer_map_path.open("w") as f:
             f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
+
+
+@pytest.mark.timeout(10000)
+def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    """
+    Test that GC is able to collect all old layers even if them are forming
+    "stairs" and there are not three delta layers since last image layer.
+
+    Information about image layers needed to collect old layers should
+    be propagated by GC to compaction task which should take in in account
+    when make a decision which new image layers needs to be created.
+
+    NB: this test demonstrates the problem. The source tree contained the
+    `gc_feedback` mechanism for about 9 months, but, there were problems
+    with it and it wasn't enabled at runtime.
+    This PR removed the code: https://github.com/neondatabase/neon/pull/6863
+
+    And the bottom-most GC-compaction epic resolves the problem.
+    https://github.com/neondatabase/neon/issues/8002
+    """
+    gc_feedback_impl(neon_env_builder, zenbenchmark, "normal")
+
+
+@pytest.mark.timeout(10000)
+def test_gc_feedback_with_snapshots(
+    neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
+):
+    """
+    Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle
+    of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC
+    horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point,
+    and images covering the full key range (in a delta layer) at the GC horizon.
+    """
+    gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots")
@@ -18,7 +18,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active
 from fixtures.utils import query_scalar
 from performance.test_perf_pgbench import get_scales_matrix
 from requests import RequestException
-from requests.exceptions import RetryError


 # Test branch creation
@@ -151,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
     env.pageserver.allowed_errors.extend(
         [
             ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
-            ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading",
+            ".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline",
         ]
     )
     ps_http = env.pageserver.http_client()
@@ -176,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE

         env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline)

-        with pytest.raises(RuntimeError, match="is not active, state: Loading"):
-            env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant)
+        with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"):
+            env.endpoints.create_start(
+                initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2
+            )
+        ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
     finally:
-        # FIXME: paused uploads bother shutdown
         env.pageserver.stop(immediate=True)

         t.join()
@@ -193,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
     env = neon_env_builder.init_configs()
     env.start()

-    env.pageserver.allowed_errors.append(
-        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
+            ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*",
+        ]
     )
     ps_http = env.pageserver.http_client()

@@ -216,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder

     branch_id = TimelineId.generate()

-    with pytest.raises(RetryError, match="too many 503 error responses"):
+    with pytest.raises(
+        PageserverApiException,
+        match="Cannot branch off the timeline that's not present in pageserver",
+    ):
         ps_http.timeline_create(
             env.pg_version,
             env.initial_tenant,
@@ -389,6 +396,11 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
         repeat_result = ps_http.timeline_create(
             env.pg_version, env.initial_tenant, success_timeline, timeout=60
         )
+        # remote_consistent_lsn_visible will be published only after we've
+        # confirmed the generation, which is not part of what we await during
+        # timeline creation (uploads). mask it out here to avoid flakyness.
+        del success_result["remote_consistent_lsn_visible"]
+        del repeat_result["remote_consistent_lsn_visible"]
         assert repeat_result == success_result
     finally:
         env.pageserver.stop(immediate=True)
@@ -3,18 +3,15 @@ import re
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import toml
|
import toml
|
||||||
from fixtures.common_types import Lsn
|
from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from fixtures.neon_fixtures import (
|
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
|
||||||
NeonEnv,
|
|
||||||
NeonEnvBuilder,
|
|
||||||
PgBin,
|
|
||||||
)
|
|
||||||
from fixtures.pageserver.http import PageserverApiException
|
from fixtures.pageserver.http import PageserverApiException
|
||||||
from fixtures.pageserver.utils import (
|
from fixtures.pageserver.utils import (
|
||||||
timeline_delete_wait_completed,
|
timeline_delete_wait_completed,
|
||||||
@@ -22,7 +19,8 @@ from fixtures.pageserver.utils import (
|
|||||||
wait_for_upload,
|
wait_for_upload,
|
||||||
)
|
)
|
||||||
from fixtures.pg_version import PgVersion
|
from fixtures.pg_version import PgVersion
|
||||||
from fixtures.remote_storage import RemoteStorageKind
|
from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
|
||||||
|
from fixtures.workload import Workload
|
||||||
|
|
||||||
#
|
#
|
||||||
# A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
|
# A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
|
||||||
@@ -409,3 +407,133 @@ def dump_differs(
|
|||||||
break
|
break
|
||||||
|
|
||||||
return differs
|
return differs
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class HistoricDataSet:
|
||||||
|
name: str
|
||||||
|
tenant_id: TenantId
|
||||||
|
pg_version: PgVersion
|
||||||
|
url: str
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self.name
|
||||||
|
|
||||||
|
|
||||||
|
HISTORIC_DATA_SETS = [
|
||||||
|
# From before we enabled image layer compression.
|
||||||
|
# - IndexPart::LATEST_VERSION 7
|
||||||
|
# - STORAGE_FORMAT_VERSION 3
|
||||||
|
HistoricDataSet(
|
||||||
|
"2024-07-18",
|
||||||
|
TenantId("17bf64a53509714687664b3a84e9b3ba"),
|
||||||
|
PgVersion.V16,
|
||||||
|
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS)
|
||||||
|
@pytest.mark.xdist_group("compatibility")
|
||||||
|
def test_historic_storage_formats(
|
||||||
|
neon_env_builder: NeonEnvBuilder,
|
||||||
|
test_output_dir: Path,
|
||||||
|
pg_version: PgVersion,
|
||||||
|
dataset: HistoricDataSet,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago.
|
||||||
|
"""
|
||||||
|
|
||||||
|
ARTIFACT_CACHE_DIR = "./artifact_cache"
|
||||||
|
|
||||||
|
import tarfile
|
||||||
|
from contextlib import closing
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import zstandard
|
||||||
|
|
||||||
|
artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name)
|
||||||
|
|
||||||
|
# Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by
|
||||||
|
# HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version
|
||||||
|
# will no longer be covered by this test.
|
||||||
|
if pg_version != dataset.pg_version:
|
||||||
|
pytest.skip(f"Dataset {dataset} is for different PG version, skipping")
|
||||||
|
|
||||||
|
with closing(requests.get(dataset.url, stream=True)) as r:
|
||||||
|
unzstd = zstandard.ZstdDecompressor()
|
||||||
|
with unzstd.stream_reader(r.raw) as stream:
|
||||||
|
with tarfile.open(mode="r|", fileobj=stream) as tf:
|
||||||
|
tf.extractall(artifact_unpack_path)
|
||||||
|
|
||||||
|
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||||
|
neon_env_builder.pg_version = dataset.pg_version
|
||||||
|
env = neon_env_builder.init_configs()
|
||||||
|
env.start()
|
||||||
|
assert isinstance(env.pageserver_remote_storage, S3Storage)
|
||||||
|
|
||||||
|
# Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing
|
||||||
|
# compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices.
|
||||||
|
#
|
||||||
|
# The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. We use
|
||||||
|
# S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs)
|
||||||
|
artifact_pageserver_path = (
|
||||||
|
artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver")
|
||||||
|
)
|
||||||
|
for root, _dirs, files in os.walk(artifact_pageserver_path):
|
||||||
|
for file in files:
|
||||||
|
local_path = os.path.join(root, file)
|
||||||
|
remote_key = (
|
||||||
|
env.pageserver_remote_storage.prefix_in_bucket
|
||||||
|
+ str(local_path)[len(str(artifact_pageserver_path)) :]
|
||||||
|
)
|
||||||
|
log.info(f"Uploading {local_path} -> {remote_key}")
|
||||||
|
env.pageserver_remote_storage.client.upload_file(
|
||||||
|
local_path, env.pageserver_remote_storage.bucket_name, remote_key
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt)
|
||||||
|
#
|
||||||
|
# Do this _before_ importing to the pageserver, as that import may start writing immediately
|
||||||
|
metadata_summary = env.storage_scrubber.scan_metadata()
|
||||||
|
assert metadata_summary["tenant_count"] >= 1
|
||||||
|
assert metadata_summary["timeline_count"] >= 1
|
||||||
|
assert not metadata_summary["with_errors"]
|
||||||
|
assert not metadata_summary["with_warnings"]
|
||||||
|
|
||||||
|
env.neon_cli.import_tenant(dataset.tenant_id)
|
||||||
|
|
||||||
|
# Discover timelines
|
||||||
|
timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id)
|
||||||
|
# All our artifacts should contain at least one timeline
|
||||||
|
assert len(timelines) > 0
|
||||||
|
|
||||||
|
# TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very
|
||||||
|
# least they should include a mixture of deltas and image layers. Preferably they should also
|
||||||
|
# contain some "exotic" stuff like aux files from logical replication.
|
||||||
|
|
||||||
|
# Check we can start an endpoint and read the SQL that the artifact is meant to contain
|
||||||
|
reference_sql_dump = artifact_unpack_path / Path("dump.sql")
|
||||||
|
ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id)
|
||||||
|
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||||
|
pg_bin.run_capture(
|
||||||
|
["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]
|
||||||
|
)
|
||||||
|
assert not dump_differs(
|
||||||
|
reference_sql_dump,
|
||||||
|
test_output_dir / "dump.sql",
|
||||||
|
test_output_dir / "dump.filediff",
|
||||||
|
)
|
||||||
|
ep.stop()
|
||||||
|
|
||||||
|
# Check we can also do writes to the database
|
||||||
|
existing_timeline_id = TimelineId(timelines[0]["timeline_id"])
|
||||||
|
workload = Workload(env, dataset.tenant_id, existing_timeline_id)
|
||||||
|
workload.init()
|
||||||
|
workload.write_rows(100)
|
||||||
|
|
||||||
|
# Check that compaction works
|
||||||
|
env.pageserver.http_client().timeline_compact(
|
||||||
|
dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True
|
||||||
|
)
|
||||||
|
@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
@@ -313,6 +312,7 @@ def test_remote_storage_upload_queue_retries(

     def churn_while_failpoints_active(result):
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c")
+        # this call will wait for the failpoints to be turned off
         client.timeline_checkpoint(tenant_id, timeline_id)
         client.timeline_compact(tenant_id, timeline_id)
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d")
@@ -332,8 +332,8 @@ def test_remote_storage_upload_queue_retries(
     # Exponential back-off in upload queue, so, gracious timeouts.

     wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
-    wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
-    wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
+    wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))

     # unblock churn operations
     configure_storage_sync_failpoints("off")
@@ -769,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
     create_thread.join()


-def test_compaction_waits_for_upload(
+def test_paused_upload_stalls_checkpoint(
     neon_env_builder: NeonEnvBuilder,
 ):
     """
-    This test forces a race between upload and compaction.
+    This test checks that checkpoints block on uploads to remote storage.
     """
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

@@ -788,6 +788,10 @@ def test_compaction_waits_for_upload(
         }
     )

+    env.pageserver.allowed_errors.append(
+        f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
+    )
+
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline

@@ -808,76 +812,9 @@ def test_compaction_waits_for_upload(
     endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

-    client.timeline_checkpoint(tenant_id, timeline_id)
-    deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers())
-    assert (
-        deltas_at_first == 2
-    ), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement."
-
-    endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)")
-    endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")
-    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-
-    layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
-    upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name()
-
-    assert len(upload_stuck_layers) > 0
-
-    for name in upload_stuck_layers:
-        assert env.pageserver.layer_exists(
-            tenant_id, timeline_id, parse_layer_file_name(name)
-        ), "while uploads are stuck the layers should be present on disk"
-
-    # now this will do the L0 => L1 compaction and want to remove
-    # upload_stuck_layers and the original initdb L0
-    client.timeline_checkpoint(tenant_id, timeline_id)
-
-    # as uploads are paused, the upload_stuck_layers should still be with us
-    for name in upload_stuck_layers:
-        assert env.pageserver.layer_exists(
-            tenant_id, timeline_id, parse_layer_file_name(name)
-        ), "uploads are stuck still over compaction"
-
-    compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
-    overlap = compacted_layers.intersection(upload_stuck_layers)
-    assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction"
-    assert (
-        len(compacted_layers) == 1
-    ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)"
-
-    def layer_deletes_completed():
-        m = client.get_metric_value("pageserver_layer_completed_deletes_total")
-        if m is None:
-            return 0
-        return int(m)
-
-    # if initdb created an initial delta layer, it might already be gc'd
-    # because it was uploaded before the failpoint was enabled. however, the
-    # deletion is not guaranteed to be complete.
-    assert layer_deletes_completed() <= 1
-
-    client.configure_failpoints(("before-upload-layer-pausable", "off"))
-
-    # Ensure that this actually terminates
-    wait_upload_queue_empty(client, tenant_id, timeline_id)
-
-    def until_layer_deletes_completed():
-        deletes = layer_deletes_completed()
-        log.info(f"layer_deletes: {deletes}")
-        # ensure that initdb delta layer AND the previously stuck are now deleted
-        assert deletes >= len(upload_stuck_layers) + 1
-
-    wait_until(10, 1, until_layer_deletes_completed)
-
-    for name in upload_stuck_layers:
-        assert not env.pageserver.layer_exists(
-            tenant_id, timeline_id, parse_layer_file_name(name)
-        ), "l0 should now be removed because of L0 => L1 compaction and completed uploads"
-
-    # We should not have hit the error handling path in uploads where a uploaded file is gone
-    assert not env.pageserver.log_contains(
-        "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
-    )
+    with pytest.raises(ReadTimeout):
+        client.timeline_checkpoint(tenant_id, timeline_id, timeout=5)
+    client.configure_failpoints(("before-upload-layer-pausable", "off"))


 def wait_upload_queue_empty(
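The queue-depth assertions changed above (assert_gt, assert_ge, assert_eq wrapped in wait_until) poll the upload queue rather than checking it once, because the queue drains asynchronously with exponential back-off. A minimal sketch of that polling pattern, assuming a retry helper with the same (iterations, interval, callable) shape as the one used in the test, not the project's actual fixture:

import time


def wait_until(iterations: int, interval: float, func):
    """Retry func until it stops raising, up to `iterations` attempts `interval` seconds apart."""
    last_exc = None
    for _ in range(iterations):
        try:
            return func()
        except Exception as e:  # assertion failures are retried until the deadline
            last_exc = e
            time.sleep(interval)
    raise last_exc


def assert_eq(a, b):
    assert a == b


# Usage in the same style as the test, e.g. wait until the layer delete queue has drained:
# wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))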
@@ -3,7 +3,7 @@ import threading
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union

 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -1785,6 +1785,126 @@ def test_storage_controller_node_deletion(
     env.storage_controller.consistency_check()


+@pytest.mark.parametrize("shard_count", [None, 2])
+def test_storage_controller_metadata_health(
+    neon_env_builder: NeonEnvBuilder,
+    shard_count: Optional[int],
+):
+    """
+    Create three tenants A, B, C.
+
+    Phase 1:
+    - A: Post healthy status.
+    - B: Post unhealthy status.
+    - C: No updates.
+
+    Phase 2:
+    - B: Post healthy status.
+    - C: Post healthy status.
+
+    Phase 3:
+    - A: Post unhealthy status.
+
+    Phase 4:
+    - Delete tenant A, metadata health status should be deleted as well.
+    """
+
+    def update_and_query_metadata_health(
+        env: NeonEnv,
+        healthy: List[TenantShardId],
+        unhealthy: List[TenantShardId],
+        outdated_duration: str = "1h",
+    ) -> Tuple[Set[str], Set[str]]:
+        """
+        Update metadata health. Then list tenant shards with unhealthy and
+        outdated metadata health status.
+        """
+        if healthy or unhealthy:
+            env.storage_controller.metadata_health_update(healthy, unhealthy)
+        result = env.storage_controller.metadata_health_list_unhealthy()
+        unhealthy_res = set(result["unhealthy_tenant_shards"])
+        result = env.storage_controller.metadata_health_list_outdated(outdated_duration)
+        outdated_res = set(record["tenant_shard_id"] for record in result["health_records"])
+
+        return unhealthy_res, outdated_res
+
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    # Mock tenant (`initial_tenant``) with healthy scrubber scan result
+    tenant_a_shard_ids = (
+        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count)
+        if shard_count is not None
+        else [TenantShardId(env.initial_tenant, 0, 0)]
+    )
+
+    # Mock tenant with unhealthy scrubber scan result
+    tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count)
+    tenant_b_shard_ids = (
+        env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count)
+        if shard_count is not None
+        else [TenantShardId(tenant_b, 0, 0)]
+    )
+
+    # Mock tenant that never gets a health update from scrubber
+    tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count)
+
+    tenant_c_shard_ids = (
+        env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count)
+        if shard_count is not None
+        else [TenantShardId(tenant_c, 0, 0)]
+    )
+
+    # Metadata health table also updated as tenant shards are created.
+    assert env.storage_controller.metadata_health_is_healthy()
+
+    # post "fake" updates to storage controller db
+
+    unhealthy, outdated = update_and_query_metadata_health(
+        env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids
+    )
+
+    log.info(f"After Phase 1: {unhealthy=}, {outdated=}")
+    assert len(unhealthy) == len(tenant_b_shard_ids)
+    for t in tenant_b_shard_ids:
+        assert str(t) in unhealthy
+    assert len(outdated) == 0
+
+    unhealthy, outdated = update_and_query_metadata_health(
+        env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[]
+    )
+
+    log.info(f"After Phase 2: {unhealthy=}, {outdated=}")
+    assert len(unhealthy) == 0
+    assert len(outdated) == 0
+
+    unhealthy, outdated = update_and_query_metadata_health(
+        env, healthy=[], unhealthy=tenant_a_shard_ids
+    )
+
+    log.info(f"After Phase 3: {unhealthy=}, {outdated=}")
+    assert len(unhealthy) == len(tenant_a_shard_ids)
+    for t in tenant_a_shard_ids:
+        assert str(t) in unhealthy
+    assert len(outdated) == 0
+
+    # Phase 4: Delete A
+    env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant)
+
+    # A's unhealthy metadata health status should be deleted as well.
+    assert env.storage_controller.metadata_health_is_healthy()
+
+    # All shards from B and C are not fresh if set outdated duration to 0 seconds.
+    unhealthy, outdated = update_and_query_metadata_health(
+        env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s"
+    )
+    assert len(unhealthy) == 0
+    for t in tenant_b_shard_ids + tenant_c_shard_ids:
+        assert str(t) in outdated
+
+
 def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     """
     Test the `/control/v1/step_down` storage controller API. Upon receiving such
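For readers following the metadata health test added above: the storage controller responses it inspects are only touched through two keys, "unhealthy_tenant_shards" on the unhealthy listing and "health_records[*].tenant_shard_id" on the outdated listing. A hypothetical sketch of response shapes consistent with those accesses — inferred from the test only, not the controller's documented API:

# Hypothetical shapes, inferred only from the fields the test reads above.
tenant_shard = "<tenant_shard_id as string>"  # placeholder, e.g. str(TenantShardId(...))

list_unhealthy_response = {
    "unhealthy_tenant_shards": [tenant_shard],
}

list_outdated_response = {
    "health_records": [
        {"tenant_shard_id": tenant_shard},
    ],
}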
@@ -13,6 +13,7 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import S3Storage, s3_storage
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
@@ -265,10 +266,85 @@ def test_scrubber_physical_gc_ancestors(
     # attach it, to drop any local state, then check it's still readable.
     workload.stop()
     drop_local_state(env, tenant_id)

     workload.validate()


+def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder):
+    """
+    When we delete a timeline after a shard split, the child shards do not directly delete the
+    layers in the ancestor shards. They rely on the scrubber to clean up.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id,
+        timeline_id,
+        shard_count=None,
+        conf={
+            # Small layers and low compaction thresholds, so that when we split we can expect some to
+            # be dropped by child shards
+            "checkpoint_distance": f"{1024 * 1024}",
+            "compaction_threshold": "1",
+            "compaction_target_size": f"{1024 * 1024}",
+            "image_creation_threshold": "2",
+            "image_layer_creation_check_threshold": "0",
+            # Disable background compaction, we will do it explicitly
+            "compaction_period": "0s",
+            # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
+            # and makes them GC'able
+            "pitr_interval": "0s",
+        },
+    )
+
+    # Make sure the original shard has some layers
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(100)
+
+    new_shard_count = 4
+    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
+
+    # Create a second timeline so that when we delete the first one, child shards still have some content in S3.
+    #
+    # This is a limitation of the scrubber: if a shard isn't in S3 (because it has no timelines), then the scrubber
+    # doesn't know about it, and won't perceive its ancestors as ancestors.
+    other_timeline_id = TimelineId.generate()
+    env.storage_controller.pageserver_api().timeline_create(
+        PgVersion.NOT_SET, tenant_id, other_timeline_id
+    )
+
+    # Write after split so that child shards have some indices in S3
+    workload.write_rows(100, upload=False)
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
+        ps.http_client().timeline_checkpoint(
+            shard, timeline_id, compact=False, wait_until_uploaded=True
+        )
+
+    # The timeline still exists in child shards and they reference its layers, so scrubbing
+    # now shouldn't delete anything.
+    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] == 0
+    assert gc_summary["ancestor_layers_deleted"] == 0
+
+    # Delete the timeline
+    env.storage_controller.pageserver_api().timeline_delete(tenant_id, timeline_id)
+
+    # Subsequently doing physical GC should clean up the ancestor layers
+    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full")
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] == 0
+    assert gc_summary["ancestor_layers_deleted"] > 0
+
+
 def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
     """
     Exercise ancestor GC while a tenant is partly split: this test ensures that if we have some child shards
@@ -440,10 +516,12 @@ def test_scrubber_scan_pageserver_metadata(
     assert len(index.layer_metadata) > 0
     it = iter(index.layer_metadata.items())

-    scan_summary = env.storage_scrubber.scan_metadata()
+    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
     assert not scan_summary["with_warnings"]
     assert not scan_summary["with_errors"]

+    assert env.storage_controller.metadata_health_is_healthy()
+
     # Delete a layer file that is listed in the index.
     layer, metadata = next(it)
     log.info(f"Deleting {timeline_path}/{layer.to_str()}")
@@ -453,7 +531,17 @@ def test_scrubber_scan_pageserver_metadata(
     )
     log.info(f"delete response: {delete_response}")

-    # Check scan summary. Expect it to be a L0 layer so only emit warnings.
+    # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
     scan_summary = env.storage_scrubber.scan_metadata()
     log.info(f"{pprint.pformat(scan_summary)}")
     assert len(scan_summary["with_warnings"]) > 0
+
+    assert env.storage_controller.metadata_health_is_healthy()
+
+    # Now post to storage controller, expect seeing one unhealthy health record
+    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+    log.info(f"{pprint.pformat(scan_summary)}")
+    assert len(scan_summary["with_warnings"]) > 0
+
+    unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
+    assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
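The timeline-deletion test above encodes a simple invariant: after a shard split, an ancestor shard's layer may only be garbage-collected once no child shard's index still references it (here, once the referencing timeline has been deleted). A simplified model of that rule, purely illustrative and not the scrubber's actual implementation:

from typing import Dict, Set


def deletable_ancestor_layers(
    ancestor_layers: Set[str],
    child_indices: Dict[str, Set[str]],  # child shard id -> layer file names its index references
) -> Set[str]:
    """Ancestor layers that no child shard index references any more (illustrative only)."""
    referenced: Set[str] = set()
    for layers in child_indices.values():
        referenced |= layers
    return ancestor_layers - referenced


# Before the timeline is deleted, children still reference the ancestor's layers -> nothing to GC.
assert deletable_ancestor_layers({"L1", "L2"}, {"child-0": {"L1"}, "child-1": {"L2"}}) == set()
# After deletion, no index references them -> the scrubber may remove them.
assert deletable_ancestor_layers({"L1", "L2"}, {"child-0": set(), "child-1": set()}) == {"L1", "L2"}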
@@ -277,8 +277,12 @@ files:
       help: 'Bytes between received and replayed LSN'
       key_labels:
       values: [replication_delay_bytes]
+      # We use a GREATEST call here because this calculation can be negative.
+      # The calculation is not atomic, meaning after we've gotten the receive
+      # LSN, the replay LSN may have advanced past the receive LSN we
+      # are using for the calculation.
       query: |
-        SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes;
+        SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;

     - metric_name: replication_delay_seconds
       type: gauge
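The new comment explains the clamp: pg_last_wal_receive_lsn() and pg_last_wal_replay_lsn() are read at slightly different moments, so replay can overtake the sampled receive LSN and the raw difference goes negative; GREATEST(0, ...) is simply max(0, x) applied in SQL. A tiny illustrative sketch of the same clamp, using made-up LSN byte offsets rather than values from the exporter:

# Hypothetical byte offsets: replay was sampled after receive and has already moved past it.
receive_lsn = 10_000_000   # sampled first
replay_lsn = 10_000_512    # sampled a moment later, already ahead

raw_delay_bytes = receive_lsn - replay_lsn      # -512: would report a negative gauge
clamped_delay_bytes = max(0, raw_delay_bytes)   # 0: what GREATEST(0, ...) reports

assert clamped_delay_bytes == 0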