mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-26 23:00:37 +00:00
Compare commits
558 Commits
jcsp/layer
...
release-61
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
28ee7cdede | ||
|
|
7b63092958 | ||
|
|
31bfeaf934 | ||
|
|
21b3a191bf | ||
|
|
f7f9b4aaec | ||
|
|
bba062e262 | ||
|
|
067363fe95 | ||
|
|
affe408433 | ||
|
|
9b883e4651 | ||
|
|
b98b301d56 | ||
|
|
ed7ee73cba | ||
|
|
fceace835b | ||
|
|
1b508a6082 | ||
|
|
f87b031876 | ||
|
|
9f1ba2c4bf | ||
|
|
9868bb3346 | ||
|
|
27da0e9cf5 | ||
|
|
de9bf2af6c | ||
|
|
3d2c2ce139 | ||
|
|
82a2081d61 | ||
|
|
ff174a88c0 | ||
|
|
ef3ebfaf67 | ||
|
|
ae1af558b4 | ||
|
|
c150ad4ee2 | ||
|
|
a98ccd185b | ||
|
|
9f796ebba9 | ||
|
|
d51ca338c4 | ||
|
|
07e78102bf | ||
|
|
b21e131d11 | ||
|
|
abe3b4e005 | ||
|
|
18e7c2b7a1 | ||
|
|
ad5d784fb7 | ||
|
|
85d47637ee | ||
|
|
7e818ee390 | ||
|
|
bff505426e | ||
|
|
bf7de92dc2 | ||
|
|
9dc71f5a88 | ||
|
|
2ede9d7a25 | ||
|
|
ea5460843c | ||
|
|
5b16624bcc | ||
|
|
349373cb11 | ||
|
|
957f99cad5 | ||
|
|
2a3a136474 | ||
|
|
cfaf30f5e8 | ||
|
|
72c2d0812e | ||
|
|
537ecf45f8 | ||
|
|
1637a6ee05 | ||
|
|
4e547e6274 | ||
|
|
3d582b212a | ||
|
|
3fbb84d741 | ||
|
|
a4fa250c92 | ||
|
|
39aeb10cfc | ||
|
|
44781518d0 | ||
|
|
16071e57c6 | ||
|
|
392d3524f9 | ||
|
|
c96e8012ce | ||
|
|
5a772761ee | ||
|
|
841b76ea7c | ||
|
|
a4434cf1c0 | ||
|
|
d263b1804e | ||
|
|
b461755326 | ||
|
|
9ded2556df | ||
|
|
7672e49ab5 | ||
|
|
a2d170b6d0 | ||
|
|
1303d47778 | ||
|
|
e250b9e063 | ||
|
|
0c236fa465 | ||
|
|
da84a250c6 | ||
|
|
975f8ac658 | ||
|
|
839a5724a4 | ||
|
|
f2b8e390e7 | ||
|
|
f7131834eb | ||
|
|
4a90423292 | ||
|
|
f4f0869dc8 | ||
|
|
0950866fa8 | ||
|
|
7cf59ae5b4 | ||
|
|
b197cc20fc | ||
|
|
ba17025a57 | ||
|
|
b5ab055526 | ||
|
|
a40b402957 | ||
|
|
d2ee760eb2 | ||
|
|
66337097de | ||
|
|
e6dadcd2f3 | ||
|
|
83e07c1a5b | ||
|
|
ee263e6a62 | ||
|
|
7eb37fea26 | ||
|
|
730db859c7 | ||
|
|
04448ac323 | ||
|
|
324e4e008f | ||
|
|
d74fb7b879 | ||
|
|
b49b450dc4 | ||
|
|
7973c3e941 | ||
|
|
085bbaf5f8 | ||
|
|
85b5219861 | ||
|
|
7472c69954 | ||
|
|
3f8819827c | ||
|
|
c440756410 | ||
|
|
0e600eb921 | ||
|
|
a1df835e28 | ||
|
|
119ddf6ccf | ||
|
|
90f447b79d | ||
|
|
7dd71f4126 | ||
|
|
8532d72276 | ||
|
|
d3ff47f572 | ||
|
|
8cc768254f | ||
|
|
5c80743c9c | ||
|
|
5bba3e3c75 | ||
|
|
6caf702417 | ||
|
|
32f668f5e7 | ||
|
|
a91f9d5832 | ||
|
|
547acde6cd | ||
|
|
bea6532881 | ||
|
|
8e2fe6b22e | ||
|
|
4d75e1ef81 | ||
|
|
4c7c00268c | ||
|
|
f28abb953d | ||
|
|
4df39d7304 | ||
|
|
bfc7338246 | ||
|
|
35dac6e6c8 | ||
|
|
e619e8703e | ||
|
|
6fd35bfe32 | ||
|
|
547a431b0d | ||
|
|
f8c01c6341 | ||
|
|
1145700f87 | ||
|
|
44339f5b70 | ||
|
|
7b4a9c1d82 | ||
|
|
3b2fc27de4 | ||
|
|
0b6492e7d3 | ||
|
|
7cfaecbeb6 | ||
|
|
472acae615 | ||
|
|
108bf56e44 | ||
|
|
e83a499ab4 | ||
|
|
ebf3bfadde | ||
|
|
ab06240fae | ||
|
|
cec216c5c0 | ||
|
|
930201e033 | ||
|
|
8328580dc2 | ||
|
|
8d9b632f2a | ||
|
|
55d37c77b9 | ||
|
|
0948fb6bf1 | ||
|
|
285c6d2974 | ||
|
|
8a8b83df27 | ||
|
|
4bdfb96078 | ||
|
|
8da3b547f8 | ||
|
|
b329b1c610 | ||
|
|
4184685721 | ||
|
|
411a130675 | ||
|
|
0645ae318e | ||
|
|
86d6ef305a | ||
|
|
2e37aa3fe8 | ||
|
|
30bbfde50d | ||
|
|
82b9a44ab4 | ||
|
|
4a87bac036 | ||
|
|
38b4ed297e | ||
|
|
cd29156927 | ||
|
|
814c8e8f68 | ||
|
|
0159ae9536 | ||
|
|
d9a82468e2 | ||
|
|
e26ef640c1 | ||
|
|
c11b9cb43d | ||
|
|
69b6675da0 | ||
|
|
6bbd34a216 | ||
|
|
24f8133e89 | ||
|
|
9f4511c554 | ||
|
|
e78341e1c2 | ||
|
|
98387d6fb1 | ||
|
|
1afab13ccb | ||
|
|
e89ec55ea5 | ||
|
|
fe13fccdc2 | ||
|
|
1a49f1c15c | ||
|
|
9bb16c8780 | ||
|
|
3f7aebb01c | ||
|
|
abc330e095 | ||
|
|
6d3cb222ee | ||
|
|
b1fe8259b4 | ||
|
|
4a5b55c834 | ||
|
|
73fa3c014b | ||
|
|
c196cf6ac1 | ||
|
|
8b15864f59 | ||
|
|
d9c1068cf4 | ||
|
|
811eb88b89 | ||
|
|
a5491463e1 | ||
|
|
df3dc6e4c1 | ||
|
|
daea26a22f | ||
|
|
84b039e615 | ||
|
|
a58827f952 | ||
|
|
36b790f282 | ||
|
|
3ef7748e6b | ||
|
|
f3310143e4 | ||
|
|
05b4169644 | ||
|
|
d1495755e7 | ||
|
|
c8dd78c6c8 | ||
|
|
b44ee3950a | ||
|
|
64334f497d | ||
|
|
5ffcb688cc | ||
|
|
32fc2dd683 | ||
|
|
d35ddfbab7 | ||
|
|
3ee82a9895 | ||
|
|
e770aeee92 | ||
|
|
32828cddd6 | ||
|
|
bd2046e1ab | ||
|
|
7e2a3d2728 | ||
|
|
0e4832308d | ||
|
|
0a63bc4818 | ||
|
|
2897dcc9aa | ||
|
|
1d0ec50ddb | ||
|
|
a86b43fcd7 | ||
|
|
b917868ada | ||
|
|
7b7d16f52e | ||
|
|
fee4169b6b | ||
|
|
47e06a2cc6 | ||
|
|
c4423c0623 | ||
|
|
a11cf03123 | ||
|
|
08b33adfee | ||
|
|
4fb50144dd | ||
|
|
c500137ca9 | ||
|
|
252c4acec9 | ||
|
|
db70c175e6 | ||
|
|
ed3b4a58b4 | ||
|
|
2863d1df63 | ||
|
|
320b24eab3 | ||
|
|
13a8a5b09b | ||
|
|
64ccdf65e0 | ||
|
|
1ae6aa09dd | ||
|
|
aeb68e51df | ||
|
|
c3e5223a5d | ||
|
|
daaa3211a4 | ||
|
|
7ff9989dd5 | ||
|
|
ed3b97604c | ||
|
|
47c50ec460 | ||
|
|
8c0ec2f681 | ||
|
|
588bda98e7 | ||
|
|
504ca7720f | ||
|
|
cf4ea92aad | ||
|
|
325294bced | ||
|
|
86c8ba2563 | ||
|
|
a68edad913 | ||
|
|
fcdf060816 | ||
|
|
1c57f6bac3 | ||
|
|
b54dd9af15 | ||
|
|
118847cd41 | ||
|
|
f2ec542954 | ||
|
|
2a3410d1c3 | ||
|
|
1121a1cbac | ||
|
|
154ba5e1b4 | ||
|
|
27fe7f8963 | ||
|
|
feeb2dc6fa | ||
|
|
57f476ff5a | ||
|
|
7ee2bebdb7 | ||
|
|
be598f1bf4 | ||
|
|
939b5954a5 | ||
|
|
371020fe6a | ||
|
|
f45818abed | ||
|
|
0384267d58 | ||
|
|
62b3bd968a | ||
|
|
e3e3bc3542 | ||
|
|
be014a2222 | ||
|
|
2e1fe71cc0 | ||
|
|
068c158ca5 | ||
|
|
b16e4f689f | ||
|
|
dbff725a0c | ||
|
|
7fa4628434 | ||
|
|
fc538a38b9 | ||
|
|
c2e7cb324f | ||
|
|
101043122e | ||
|
|
c4d7d59825 | ||
|
|
0de1e1d664 | ||
|
|
271598b77f | ||
|
|
459bc479dc | ||
|
|
c213373a59 | ||
|
|
e0addc100d | ||
|
|
0519138b04 | ||
|
|
5da39b469c | ||
|
|
82027e22dd | ||
|
|
c431e2f1c5 | ||
|
|
4e5724d9c3 | ||
|
|
0d3e499059 | ||
|
|
7b860b837c | ||
|
|
41fc96e20f | ||
|
|
fb2b1ce57b | ||
|
|
464717451b | ||
|
|
c6ed86d3d0 | ||
|
|
f0a9017008 | ||
|
|
bb7949ba00 | ||
|
|
1df0f69664 | ||
|
|
970066a914 | ||
|
|
1ebd3897c0 | ||
|
|
6460beffcd | ||
|
|
6f7f8958db | ||
|
|
936a00e077 | ||
|
|
96a4e8de66 | ||
|
|
01180666b0 | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
@@ -9,8 +9,8 @@ inputs:
|
||||
description: 'Region ID, if not set the project will be created in the default region'
|
||||
default: aws-us-east-2
|
||||
postgres_version:
|
||||
description: 'Postgres version; default is 15'
|
||||
default: '15'
|
||||
description: 'Postgres version; default is 16'
|
||||
default: '16'
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
|
||||
@@ -115,6 +115,7 @@ runs:
|
||||
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
|
||||
export DEFAULT_PG_VERSION=${PG_VERSION#v}
|
||||
export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
|
||||
export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
|
||||
|
||||
if [ "${BUILD_TYPE}" = "remote" ]; then
|
||||
export REMOTE_ENV=1
|
||||
|
||||
186
.github/workflows/benchmarking.yml
vendored
186
.github/workflows/benchmarking.yml
vendored
@@ -56,15 +56,27 @@ concurrency:
|
||||
jobs:
|
||||
bench:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "neon-staging"
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
provisioner: 'k8s-pod'
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "azure-staging"
|
||||
region_id: 'azure-eastus2'
|
||||
provisioner: 'k8s-neonvm'
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-staging"
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
@@ -85,9 +97,10 @@ jobs:
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
region_id: ${{ matrix.region_id }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
provisioner: ${{ matrix.provisioner }}
|
||||
|
||||
- name: Run benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -96,10 +109,18 @@ jobs:
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
# Set --sparse-ordering option of pytest-order plugin
|
||||
# to ensure tests are running in order of appears in the file.
|
||||
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
|
||||
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
|
||||
extra_params:
|
||||
-m remote_cluster
|
||||
--sparse-ordering
|
||||
--timeout 14400
|
||||
--ignore test_runner/performance/test_perf_olap.py
|
||||
--ignore test_runner/performance/test_perf_pgvector_queries.py
|
||||
--ignore test_runner/performance/test_logical_replication.py
|
||||
--ignore test_runner/performance/test_physical_replication.py
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -125,6 +146,71 @@ jobs:
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
replication-tests:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-staging"
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Run benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_logical_replication.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Run benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_physical_replication.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
generate-matrices:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
@@ -132,11 +218,14 @@ jobs:
|
||||
# Available platforms:
|
||||
# - neon-captest-new: Freshly created project (1 CU)
|
||||
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
|
||||
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
|
||||
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
|
||||
# - neon-captest-reuse: Reusing existing project
|
||||
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
|
||||
DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
@@ -147,23 +236,33 @@ jobs:
|
||||
- name: Generate matrix for pgbench benchmark
|
||||
id: pgbench-compare-matrix
|
||||
run: |
|
||||
region_id_default=${{ env.DEFAULT_REGION_ID }}
|
||||
matrix='{
|
||||
"pg_version" : [
|
||||
16
|
||||
],
|
||||
"region_id" : [
|
||||
"'"$region_id_default"'"
|
||||
],
|
||||
"platform": [
|
||||
"neon-captest-new",
|
||||
"neon-captest-reuse",
|
||||
"neonvm-captest-new"
|
||||
],
|
||||
"db_size": [ "10gb" ],
|
||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
|
||||
{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -215,7 +314,7 @@ jobs:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
@@ -240,14 +339,14 @@ jobs:
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
region_id: ${{ matrix.region_id }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
|
||||
|
||||
- name: Set up Connection String
|
||||
@@ -260,7 +359,7 @@ jobs:
|
||||
neonvm-captest-sharding-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
||||
;;
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -285,6 +384,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -298,6 +398,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -311,6 +412,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -337,6 +439,13 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
pgbench-pgvector:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- PLATFORM: "neon-captest-pgvector"
|
||||
- PLATFORM: "azure-captest-pgvector"
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "1"
|
||||
@@ -344,8 +453,9 @@ jobs:
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-captest-pgvector"
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
@@ -355,17 +465,39 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
# until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
|
||||
# instead of using Neon artifacts containing pgbench
|
||||
- name: Install postgresql-16 where pytest expects it
|
||||
run: |
|
||||
cd /home/nonroot
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb
|
||||
dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
|
||||
dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
|
||||
dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
|
||||
mkdir -p /tmp/neon/pg_install/v16/bin
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
|
||||
ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
|
||||
/tmp/neon/pg_install/v16/bin/pgbench --version
|
||||
/tmp/neon/pg_install/v16/bin/psql --version
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-pgvector)
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
;;
|
||||
azure-captest-pgvector)
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -377,6 +509,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -390,6 +523,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -404,11 +538,10 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
|
||||
clickbench-compare:
|
||||
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
|
||||
# we use for performance testing in pgbench-compare.
|
||||
@@ -652,6 +785,7 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
3
.github/workflows/build_and_test.yml
vendored
3
.github/workflows/build_and_test.yml
vendored
@@ -1336,6 +1336,7 @@ jobs:
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
PREFIX: artifacts/latest
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
run: |
|
||||
# Update compatibility snapshot for the release
|
||||
for pg_version in v14 v15 v16; do
|
||||
@@ -1349,7 +1350,7 @@ jobs:
|
||||
|
||||
# Update Neon artifact for the release (reuse already uploaded artifact)
|
||||
for build_type in debug release; do
|
||||
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
|
||||
OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
|
||||
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
|
||||
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
|
||||
183
Cargo.lock
generated
183
Cargo.lock
generated
@@ -1236,6 +1236,7 @@ dependencies = [
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
"rlimit",
|
||||
"rust-ini",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -1367,6 +1368,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"whoami",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -1397,9 +1399,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crc32c"
|
||||
version = "0.6.5"
|
||||
version = "0.6.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
|
||||
checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
|
||||
dependencies = [
|
||||
"rustc_version",
|
||||
]
|
||||
@@ -1651,6 +1653,16 @@ dependencies = [
|
||||
"rusticata-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.3.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "desim"
|
||||
version = "0.1.0"
|
||||
@@ -2017,16 +2029,6 @@ dependencies = [
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs2"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fsevent-sys"
|
||||
version = "4.1.0"
|
||||
@@ -3008,9 +3010,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "measured"
|
||||
version = "0.0.21"
|
||||
version = "0.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
|
||||
checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"crossbeam-utils",
|
||||
@@ -3026,9 +3028,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "measured-derive"
|
||||
version = "0.0.21"
|
||||
version = "0.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
|
||||
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
@@ -3038,9 +3040,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "measured-process"
|
||||
version = "0.0.21"
|
||||
version = "0.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
|
||||
checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"measured",
|
||||
@@ -3232,16 +3234,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||
dependencies = [
|
||||
"overload",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.4.1"
|
||||
@@ -3275,6 +3267,12 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.45"
|
||||
@@ -3531,12 +3529,6 @@ version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "p256"
|
||||
version = "0.11.1"
|
||||
@@ -3667,6 +3659,7 @@ dependencies = [
|
||||
"sysinfo",
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-epoll-uring",
|
||||
"tokio-io-timeout",
|
||||
@@ -4077,6 +4070,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -4117,6 +4111,12 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
@@ -4389,6 +4389,7 @@ dependencies = [
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"typed-json",
|
||||
"url",
|
||||
"urlencoding",
|
||||
"utils",
|
||||
@@ -4587,6 +4588,15 @@ dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.2"
|
||||
@@ -4877,6 +4887,15 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rlimit"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "routerify"
|
||||
version = "3.0.0"
|
||||
@@ -5145,7 +5164,6 @@ dependencies = [
|
||||
"crc32c",
|
||||
"desim",
|
||||
"fail",
|
||||
"fs2",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
@@ -5172,6 +5190,8 @@ dependencies = [
|
||||
"sha2",
|
||||
"signal-hook",
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
@@ -5396,9 +5416,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.183"
|
||||
version = "1.0.203"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
|
||||
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
@@ -5415,9 +5435,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.183"
|
||||
version = "1.0.203"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
|
||||
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -5786,6 +5806,28 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage_controller_client"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres",
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage_scrubber"
|
||||
version = "0.1.0"
|
||||
@@ -5820,6 +5862,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"storage_controller_client",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -5849,6 +5892,7 @@ dependencies = [
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"storage_controller_client",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -6107,12 +6151,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.21"
|
||||
version = "0.3.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
|
||||
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"itoa",
|
||||
"js-sys",
|
||||
"num-conv",
|
||||
"powerfmt",
|
||||
"serde",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
@@ -6120,16 +6167,17 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.1"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
|
||||
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.9"
|
||||
version = "0.2.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
|
||||
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
|
||||
dependencies = [
|
||||
"num-conv",
|
||||
"time-core",
|
||||
]
|
||||
|
||||
@@ -6472,17 +6520,6 @@ version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
||||
|
||||
[[package]]
|
||||
name = "trace"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"pageserver_api",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.37"
|
||||
@@ -6582,7 +6619,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -6647,6 +6683,16 @@ dependencies = [
|
||||
"static_assertions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typed-json"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.16.0"
|
||||
@@ -6943,6 +6989,12 @@ version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasite"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.92"
|
||||
@@ -7095,6 +7147,17 @@ dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "whoami"
|
||||
version = "1.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
|
||||
dependencies = [
|
||||
"redox_syscall 0.4.1",
|
||||
"wasite",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
@@ -7427,13 +7490,12 @@ dependencies = [
|
||||
"clap",
|
||||
"clap_builder",
|
||||
"crossbeam-utils",
|
||||
"deranged",
|
||||
"either",
|
||||
"fail",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"getrandom 0.2.11",
|
||||
"hashbrown 0.14.5",
|
||||
@@ -7451,7 +7513,9 @@ dependencies = [
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
"prost",
|
||||
"quote",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"regex-automata 0.4.3",
|
||||
@@ -7468,6 +7532,7 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
"syn 2.0.52",
|
||||
"sync_wrapper",
|
||||
"tikv-jemalloc-sys",
|
||||
"time",
|
||||
"time-macros",
|
||||
"tokio",
|
||||
|
||||
12
Cargo.toml
12
Cargo.toml
@@ -13,9 +13,9 @@ members = [
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
"storage_controller",
|
||||
"storage_controller/client",
|
||||
"storage_scrubber",
|
||||
"workspace_hack",
|
||||
"trace",
|
||||
"libs/compute_api",
|
||||
"libs/pageserver_api",
|
||||
"libs/postgres_ffi",
|
||||
@@ -84,7 +84,6 @@ enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
fallible-iterator = "0.2"
|
||||
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
@@ -111,8 +110,8 @@ lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.21", features=["lasso"] }
|
||||
measured-process = { version = "0.0.21" }
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.8"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
@@ -184,14 +183,16 @@ tower-service = "0.3.2"
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.21.0"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
typed-json = "0.1"
|
||||
url = "2.2"
|
||||
urlencoding = "2.1"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
rustls-native-certs = "0.7"
|
||||
x509-parser = "0.15"
|
||||
whoami = "1.5.1"
|
||||
|
||||
## TODO replace this with tracing
|
||||
env_logger = "0.10"
|
||||
@@ -221,6 +222,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
storage_controller_client = { path = "./storage_controller/client" }
|
||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
|
||||
@@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
|
||||
FROM build-deps AS rum-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/rum.patch /rum.patch
|
||||
|
||||
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
||||
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /rum.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
@@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
||||
zstd = "0.13"
|
||||
bytes = "1.0"
|
||||
rust-ini = "0.20.0"
|
||||
rlimit = "0.10.1"
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - If remote_extension_config is provided, it will be used to fetch extensions list
|
||||
//! and download `shared_preload_libraries` from the remote storage.
|
||||
//! and download `shared_preload_libraries` from the remote storage.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
@@ -33,7 +33,6 @@
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
@@ -64,6 +63,7 @@ use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::swap::resize_swap;
|
||||
use rlimit::{setrlimit, Resource};
|
||||
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
// in-case of not-set environment var
|
||||
@@ -72,6 +72,9 @@ const BUILD_TAG_DEFAULT: &str = "latest";
|
||||
fn main() -> Result<()> {
|
||||
let (build_tag, clap_args) = init()?;
|
||||
|
||||
// enable core dumping for all child processes
|
||||
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||
|
||||
let (pg_handle, start_pg_result) = {
|
||||
// Enter startup tracing context
|
||||
let _startup_context_guard = startup_context_from_env();
|
||||
|
||||
@@ -56,6 +56,7 @@ pub struct ComputeNode {
|
||||
/// - we push new spec and it does reconfiguration
|
||||
/// - but then something happens and compute pod / VM is destroyed,
|
||||
/// so k8s controller starts it again with the **old** spec
|
||||
///
|
||||
/// and the same for empty computes:
|
||||
/// - we started compute without any spec
|
||||
/// - we push spec and it does configuration
|
||||
@@ -798,7 +799,11 @@ impl ComputeNode {
|
||||
// In this case we need to connect with old `zenith_admin` name
|
||||
// and create new user. We cannot simply rename connected user,
|
||||
// but we can create a new one and grant it all privileges.
|
||||
let connstr = self.connstr.clone();
|
||||
let mut connstr = self.connstr.clone();
|
||||
connstr
|
||||
.query_pairs_mut()
|
||||
.append_pair("application_name", "apply_config");
|
||||
|
||||
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
||||
Err(e) => match e.code() {
|
||||
Some(&SqlState::INVALID_PASSWORD)
|
||||
@@ -867,6 +872,11 @@ impl ComputeNode {
|
||||
|
||||
// Run migrations separately to not hold up cold starts
|
||||
thread::spawn(move || {
|
||||
let mut connstr = connstr.clone();
|
||||
connstr
|
||||
.query_pairs_mut()
|
||||
.append_pair("application_name", "migrations");
|
||||
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
handle_migrations(&mut client).context("apply_config handle_migrations")
|
||||
});
|
||||
@@ -1107,7 +1117,7 @@ impl ComputeNode {
|
||||
// EKS worker nodes have following core dump settings:
|
||||
// /proc/sys/kernel/core_pattern -> core
|
||||
// /proc/sys/kernel/core_uses_pid -> 1
|
||||
// ulimint -c -> unlimited
|
||||
// ulimit -c -> unlimited
|
||||
// which results in core dumps being written to postgres data directory as core.<pid>.
|
||||
//
|
||||
// Use that as a default location and pattern, except macos where core dumps are written
|
||||
@@ -1386,7 +1396,9 @@ pub fn forward_termination_signal() {
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
|
||||
kill(pg_pid, Signal::SIGQUIT).ok();
|
||||
// Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
|
||||
// ROs to get a list of running xacts faster instead of going through the CLOG.
|
||||
// See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
|
||||
kill(pg_pid, Signal::SIGINT).ok();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
mod migration;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
105
compute_tools/src/migration.rs
Normal file
105
compute_tools/src/migration.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use anyhow::{Context, Result};
|
||||
use postgres::Client;
|
||||
use tracing::info;
|
||||
|
||||
pub(crate) struct MigrationRunner<'m> {
|
||||
client: &'m mut Client,
|
||||
migrations: &'m [&'m str],
|
||||
}
|
||||
|
||||
impl<'m> MigrationRunner<'m> {
|
||||
pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
|
||||
// The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
|
||||
assert!(migrations.len() + 1 < i64::MAX as usize);
|
||||
|
||||
Self { client, migrations }
|
||||
}
|
||||
|
||||
fn get_migration_id(&mut self) -> Result<i64> {
|
||||
let query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = self
|
||||
.client
|
||||
.query_one(query, &[])
|
||||
.context("run_migrations get migration_id")?;
|
||||
|
||||
Ok(row.get::<&str, i64>("id"))
|
||||
}
|
||||
|
||||
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
|
||||
let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
|
||||
|
||||
self.client
|
||||
.simple_query(&setval)
|
||||
.context("run_migrations update id")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn prepare_migrations(&mut self) -> Result<()> {
|
||||
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn run_migrations(mut self) -> Result<()> {
|
||||
self.prepare_migrations()?;
|
||||
|
||||
let mut current_migration = self.get_migration_id()? as usize;
|
||||
while current_migration < self.migrations.len() {
|
||||
macro_rules! migration_id {
|
||||
($cm:expr) => {
|
||||
($cm + 1) as i64
|
||||
};
|
||||
}
|
||||
|
||||
let migration = self.migrations[current_migration];
|
||||
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", migration_id!(current_migration));
|
||||
} else {
|
||||
info!(
|
||||
"Running migration id={}:\n{}\n",
|
||||
migration_id!(current_migration),
|
||||
migration
|
||||
);
|
||||
|
||||
self.client
|
||||
.simple_query("BEGIN")
|
||||
.context("begin migration")?;
|
||||
|
||||
self.client.simple_query(migration).with_context(|| {
|
||||
format!(
|
||||
"run_migrations migration id={}",
|
||||
migration_id!(current_migration)
|
||||
)
|
||||
})?;
|
||||
|
||||
// Migration IDs start at 1
|
||||
self.update_migration_id(migration_id!(current_migration))?;
|
||||
|
||||
self.client
|
||||
.simple_query("COMMIT")
|
||||
.context("commit migration")?;
|
||||
|
||||
info!("Finished migration id={}", migration_id!(current_migration));
|
||||
}
|
||||
|
||||
current_migration += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
|
||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
|
||||
|
||||
use crate::config;
|
||||
use crate::logger::inlinify;
|
||||
use crate::migration::MigrationRunner;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
@@ -776,84 +777,25 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
|
||||
// Add new migrations in numerical order.
|
||||
let migrations = [
|
||||
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
|
||||
include_str!("./migrations/0001-alter_roles.sql"),
|
||||
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
|
||||
include_str!("./migrations/0002-alter_roles.sql"),
|
||||
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
|
||||
include_str!(
|
||||
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
|
||||
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!(
|
||||
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
|
||||
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
|
||||
include_str!(
|
||||
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
|
||||
),
|
||||
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
|
||||
];
|
||||
|
||||
let mut func = || {
|
||||
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
func().context("handle_migrations prepare")?;
|
||||
|
||||
let query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client
|
||||
.query_one(query, &[])
|
||||
.context("handle_migrations get migration_id")?;
|
||||
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
||||
let starting_migration_id = current_migration;
|
||||
|
||||
let query = "BEGIN";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations begin")?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
let migration = &migrations[current_migration];
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", current_migration);
|
||||
} else {
|
||||
info!(
|
||||
"Running migration id={}:\n{}\n",
|
||||
current_migration, migration
|
||||
);
|
||||
client.simple_query(migration).with_context(|| {
|
||||
format!("handle_migrations current_migration={}", current_migration)
|
||||
})?;
|
||||
}
|
||||
current_migration += 1;
|
||||
}
|
||||
let setval = format!(
|
||||
"UPDATE neon_migration.migration_id SET id={}",
|
||||
migrations.len()
|
||||
);
|
||||
client
|
||||
.simple_query(&setval)
|
||||
.context("handle_migrations update id")?;
|
||||
|
||||
let query = "COMMIT";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations commit")?;
|
||||
|
||||
info!(
|
||||
"Ran {} migrations",
|
||||
(migrations.len() - starting_migration_id)
|
||||
);
|
||||
MigrationRunner::new(client, &migrations).run_migrations()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ safekeeper_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
storage_broker.workspace = true
|
||||
utils.workspace = true
|
||||
whoami.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
//! Code to manage the storage broker
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//! In the local test environment, the storage broker stores its data directly in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! .neon
|
||||
//! ```
|
||||
use std::time::Duration;
|
||||
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
//! Code to manage pageservers
|
||||
//!
|
||||
//! In the local test environment, the pageserver stores its data directly in
|
||||
//! In the local test environment, the data for each pageserver is stored in
|
||||
//!
|
||||
//! .neon/
|
||||
//! ```text
|
||||
//! .neon/pageserver_<pageserver_id>
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -15,7 +17,6 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::models::{
|
||||
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
@@ -350,11 +351,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
trace_read_requests: settings
|
||||
.remove("trace_read_requests")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
eviction_policy: settings
|
||||
.remove("eviction_policy")
|
||||
.map(serde_json::from_str)
|
||||
@@ -455,11 +451,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
trace_read_requests: settings
|
||||
.remove("trace_read_requests")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
eviction_policy: settings
|
||||
.remove("eviction_policy")
|
||||
.map(serde_json::from_str)
|
||||
@@ -566,60 +557,39 @@ impl PageServerNode {
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let (client, conn) = self.page_server_psql_client().await?;
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = conn.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
let client = std::pin::pin!(client);
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
|
||||
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
|
||||
let base_tarfile =
|
||||
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
|
||||
|
||||
// Init wal reader if necessary
|
||||
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
|
||||
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
|
||||
let wal_reader =
|
||||
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
|
||||
(end_lsn, Some(wal_reader))
|
||||
} else {
|
||||
(start_lsn, None)
|
||||
};
|
||||
|
||||
let copy_in = |reader, cmd| {
|
||||
let client = &client;
|
||||
async move {
|
||||
let writer = client.copy_in(&cmd).await?;
|
||||
let writer = std::pin::pin!(writer);
|
||||
let mut writer = writer.sink_map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
|
||||
});
|
||||
let mut reader = std::pin::pin!(reader);
|
||||
writer.send_all(&mut reader).await?;
|
||||
writer.into_inner().finish().await?;
|
||||
anyhow::Ok(())
|
||||
}
|
||||
};
|
||||
|
||||
// Import base
|
||||
copy_in(
|
||||
base_tarfile,
|
||||
format!(
|
||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
||||
),
|
||||
)
|
||||
.await?;
|
||||
// Import wal if necessary
|
||||
if let Some(wal_reader) = wal_reader {
|
||||
copy_in(
|
||||
wal_reader,
|
||||
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
|
||||
self.http_client
|
||||
.import_basebackup(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
pg_version,
|
||||
base_tarfile,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Import wal if necessary
|
||||
if let Some(wal_reader) = wal_reader {
|
||||
self.http_client
|
||||
.import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -29,7 +29,6 @@ use utils::{
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: Utf8PathBuf,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
@@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller";
|
||||
|
||||
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -65,10 +66,6 @@ pub struct InspectResponse {
|
||||
|
||||
impl StorageController {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachments.json");
|
||||
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
let listen_url = env.control_plane_api.clone().unwrap();
|
||||
@@ -128,7 +125,6 @@ impl StorageController {
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
private_key,
|
||||
public_key,
|
||||
@@ -203,7 +199,6 @@ impl StorageController {
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
@@ -232,6 +227,30 @@ impl StorageController {
|
||||
Ok(database_url)
|
||||
}
|
||||
|
||||
pub async fn connect_to_database(
|
||||
&self,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
tokio_postgres::Config::new()
|
||||
.host("localhost")
|
||||
.port(self.postgres_port)
|
||||
// The user is the ambient operating system user name.
|
||||
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
|
||||
//
|
||||
// Until we get there, use the ambient operating system user name.
|
||||
// Recent tokio-postgres versions default to this if the user isn't specified.
|
||||
// But tokio-postgres fork doesn't have this upstream commit:
|
||||
// https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
|
||||
// => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
|
||||
.user(&whoami::username())
|
||||
.dbname(DB_NAME)
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
@@ -256,18 +275,21 @@ impl StorageController {
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
};
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
@@ -296,11 +318,38 @@ impl StorageController {
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
// We support running a startup SQL script to fiddle with the database before we launch storcon.
|
||||
// This is used by the test suite.
|
||||
let startup_script_path = self
|
||||
.env
|
||||
.base_data_dir
|
||||
.join("storage_controller_db.startup.sql");
|
||||
let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
|
||||
Ok(script) => {
|
||||
tokio::fs::remove_file(startup_script_path).await?;
|
||||
script
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// always run some startup script so that this code path doesn't bit rot
|
||||
"BEGIN; COMMIT;".to_string()
|
||||
} else {
|
||||
anyhow::bail!("Failed to read startup script: {e}")
|
||||
}
|
||||
}
|
||||
};
|
||||
let (mut client, conn) = self.connect_to_database().await?;
|
||||
let conn = tokio::spawn(conn);
|
||||
let tx = client.build_transaction();
|
||||
let tx = tx.start().await?;
|
||||
tx.batch_execute(&startup_script).await?;
|
||||
tx.commit().await?;
|
||||
drop(client);
|
||||
conn.await??;
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
"-p",
|
||||
self.path.as_ref(),
|
||||
"--dev",
|
||||
"--database-url",
|
||||
&database_url,
|
||||
|
||||
@@ -17,6 +17,7 @@ pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
storage_controller_client.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -14,15 +14,15 @@ use pageserver_api::{
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||
use pageserver_client::mgmt_api::{self};
|
||||
use reqwest::{Method, StatusCode, Url};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
use storage_controller_client::control_api::Client;
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Command {
|
||||
@@ -56,6 +56,10 @@ enum Command {
|
||||
#[arg(long)]
|
||||
scheduling: Option<NodeSchedulingPolicy>,
|
||||
},
|
||||
NodeDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Modify a tenant's policies in the storage controller
|
||||
TenantPolicy {
|
||||
#[arg(long)]
|
||||
@@ -245,64 +249,6 @@ impl FromStr for NodeAvailabilityArg {
|
||||
}
|
||||
}
|
||||
|
||||
struct Client {
|
||||
base_url: Url,
|
||||
jwt_token: Option<String>,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
|
||||
Self {
|
||||
base_url,
|
||||
jwt_token,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> mgmt_api::Result<RS>
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
self.base_url.host_str().unwrap(),
|
||||
self.base_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
|
||||
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
|
||||
let response = response.error_from_body().await?;
|
||||
|
||||
response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
@@ -337,7 +283,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
Command::TenantCreate { tenant_id } => {
|
||||
storcon_client
|
||||
.dispatch(
|
||||
.dispatch::<_, ()>(
|
||||
Method::POST,
|
||||
"v1/tenant".to_string(),
|
||||
Some(TenantCreateRequest {
|
||||
@@ -357,13 +303,16 @@ async fn main() -> anyhow::Result<()> {
|
||||
tracing::info!("Delete status: {}", status);
|
||||
}
|
||||
Command::Nodes {} => {
|
||||
let resp = storcon_client
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
@@ -395,13 +344,16 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
Command::Tenants {} => {
|
||||
let resp = storcon_client
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/tenant".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"TenantId",
|
||||
@@ -650,6 +602,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeDelete { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantSetTimeBasedEviction {
|
||||
tenant_id,
|
||||
period,
|
||||
|
||||
252
docs/rfcs/034-ancestor-deletion.md
Normal file
252
docs/rfcs/034-ancestor-deletion.md
Normal file
@@ -0,0 +1,252 @@
|
||||
# Ancestor Timeline Deletion
|
||||
|
||||
Created on: 2024-02-23
|
||||
|
||||
Author: John Spray
|
||||
|
||||
# Summary
|
||||
|
||||
When a tenant creates a new timeline that they will treat as their 'main' history,
|
||||
it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
|
||||
this is necessary because it is forbidden to delete a timeline which has descendents.
|
||||
|
||||
A new pageserver API is proposed to 'adopt' data from a parent timeline into
|
||||
one of its children, such that the link between ancestor and child can be severed,
|
||||
leaving the parent in a state where it may then be deleted.
|
||||
|
||||
# Motivation
|
||||
|
||||
Retaining parent timelines currently has two costs:
|
||||
|
||||
- Cognitive load on users, who have to remember which is the "real" main timeline.
|
||||
- Storage capacity cost, as the parent timeline will retain layers up to the
|
||||
child's timeline point, even if the child fully covers its keyspace with image
|
||||
layers and will never actually read from the parent.
|
||||
|
||||
# Solution
|
||||
|
||||
A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
|
||||
will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
|
||||
wish to detach from its parent.
|
||||
|
||||
On success, this API will leave the following state:
|
||||
|
||||
- The detached child timeline will no longer have an ancestor, and will contain all
|
||||
the data needed to service reads without recursing into an ancestor.
|
||||
- Any other children of the parent whose timeline points were at a lower LSN than
|
||||
the detached child timeline will be modified to have the child timeline as their
|
||||
new parent.
|
||||
- The parent timeline will still exist, but the child will no longer have it as an
|
||||
ancestor. If this was the last timeline that depended on the parent, then the
|
||||
parent will become deletable.
|
||||
|
||||
This API's implementation will consist of a series of retryable steps, such that
|
||||
on failures/timeout it can safely be called again to reach the target state.
|
||||
|
||||
## Example
|
||||
|
||||
### Before
|
||||
|
||||
The user has "rolled back" their project to LSN X, resulting in a "new main"
|
||||
timeline. The parent "old main" timeline still exists, and they would like
|
||||
to clean it up.
|
||||
|
||||
They have two other timelines A and B. A is from before the rollback point,
|
||||
and B is from after the rollback point.
|
||||
|
||||
```
|
||||
----"old main" timeline-------X-------------------------------------------->
|
||||
| | |
|
||||
|-> child A | |
|
||||
|-> "new main" timeline |
|
||||
-> child B
|
||||
|
||||
```
|
||||
|
||||
### After calling detach ancestor API
|
||||
|
||||
The "new main" timeline is no longer dependent on old main, and neither
|
||||
is child A, because it had a branch point before X.
|
||||
|
||||
The user may now choose to delete child B and "old main" to get to
|
||||
a pristine state. Child B is likely to be unwanted since the user
|
||||
chose to roll back to X, and it branches from after X. However, we
|
||||
don't assume this in the API; it is up to the user to delete it.
|
||||
|
||||
```
|
||||
|----"old main" timeline---------------------------------------------------->
|
||||
|
|
||||
|
|
||||
|
|
||||
-> child B
|
||||
|
||||
|----"new main" timeline--------->
|
||||
|
|
||||
|-> child A
|
||||
|
||||
|
||||
```
|
||||
|
||||
### After removing timelines
|
||||
|
||||
We end up with a totally clean state that leaves no trace that a rollback
|
||||
ever happened: there is only one root timeline.
|
||||
|
||||
```
|
||||
| ----"new main" timeline----------->
|
||||
|
|
||||
|-> child A
|
||||
|
||||
|
||||
```
|
||||
|
||||
## Caveats
|
||||
|
||||
Important things for API users to bear in mind:
|
||||
|
||||
- this API does not delete the parent timeline: you must still do that explicitly.
|
||||
- if there are other child timelines ahead of the branch point of the detached
|
||||
child, the parent won't be deletable: you must either delete or detach those
|
||||
children.
|
||||
- do _not_ simply loop over all children and detach them all: this can have an
|
||||
extremely high storage cost. The detach ancestor API is intended for use on a single
|
||||
timeline to make it the new "main".
|
||||
- The detach ancestor API should also not be
|
||||
exposed directly to the user as button/API, because they might decide
|
||||
to click it for all the children and thereby generate many copies of the
|
||||
parent's data -- the detach ancestor API should be used as part
|
||||
of a high level "clean up after rollback" feature.
|
||||
|
||||
## `detach_ancestor` API implementation
|
||||
|
||||
Terms used in the following sections:
|
||||
|
||||
- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
|
||||
called "new main" in the example.
|
||||
- "the parent": the parent of "the child". Also called "old main" in the example.
|
||||
- "the branch point" the ancestor_lsn of "the child"
|
||||
|
||||
### Phase 1: write out adopted layers to S3
|
||||
|
||||
The child will "adopt" layers from the parent, such that its end state contains
|
||||
all the parent's history as well as its own.
|
||||
|
||||
For all layers in the parent's layer map whose high LSN is below the branch
|
||||
point, issue S3 CopyObject requests to duplicate them into the child timeline's
|
||||
prefix. Do not add them to the child's layer map yet.
|
||||
|
||||
For delta layers in the parent's layer map which straddle the branch point, read them
|
||||
and write out only content up to the branch point into new layer objects.
|
||||
|
||||
This is a long running operation if the parent has many layers: it should be
|
||||
implemented in a way that resumes rather than restarting from scratch, if the API
|
||||
times out and is called again.
|
||||
|
||||
As an optimization, if there are no other timelines that will be adopted into
|
||||
the child, _and_ the child's image layers already full cover the branch LSN,
|
||||
then we may skip adopting layers.
|
||||
|
||||
### Phase 2: update the child's index
|
||||
|
||||
Having written out all needed layers in phase 1, atomically link them all
|
||||
into the child's IndexPart and upload to S3. This may be done while the
|
||||
child Timeline is still running.
|
||||
|
||||
### Phase 3: modify timelines ancestry
|
||||
|
||||
Modify the child's ancestor to None, and upload its IndexPart to persist the change.
|
||||
|
||||
For all timelines which have the same parent as the child, and have a branch
|
||||
point lower than our branch point, switch their ancestor_timeline to the child,
|
||||
and upload their IndexPart to persist the change.
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
### Generate full image layer on child, rather than adopting parent deltas
|
||||
|
||||
This would work for the case of a single child, but would prevent re-targeting
|
||||
other timelines that depended on the parent. If we detached many children this
|
||||
way, the storage cost would become prohibitive (consider a 1TB database with
|
||||
100 child timelines: it would cost 100TiB if they all generated their own image layers).
|
||||
|
||||
### Don't rewrite anything: just fake it in the API
|
||||
|
||||
We could add a layer of indirection that let a child "pretend" that it had no
|
||||
ancestor, when in reality it still had the parent. The pageserver API could
|
||||
accept deletion of ancestor timelines, and just update child metadata to make
|
||||
them look like they have no ancestor.
|
||||
|
||||
This would not achieve the desired reduction in storage cost, and may well be more
|
||||
complex to maintain than simply implementing the API described in this RFC.
|
||||
|
||||
### Avoid copying objects: enable child index to use parent layers directly
|
||||
|
||||
We could teach IndexPart to store a TimelineId for each layer, such that a child
|
||||
timeline could reference a parent's layers directly, rather than copying them
|
||||
into the child's prefix.
|
||||
|
||||
This would impose a cost for the normal case of indices that only target the
|
||||
timeline's own layers, add complexity, and break the useful simplifying
|
||||
invariant that timelines "own" their own path. If child timelines were
|
||||
referencing layers from the parent, we would have to ensure that the parent
|
||||
never runs GC/compaction again, which would make the API less flexible (the
|
||||
proposal in this RFC enables deletion of the parent but doesn't require it.)
|
||||
|
||||
## Performance
|
||||
|
||||
### Adopting layers
|
||||
|
||||
- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
|
||||
of such requests: this can take up to tens of seconds and will compete for RemoteStorage
|
||||
semaphore units with other activity on the pageserver.
|
||||
- If we are running on storage backend that doesn't implement CopyObject, then
|
||||
this part will be much more expensive as we would stream all layer content
|
||||
through the pageserver. This is no different to issuing a lot
|
||||
of reads to a timeline that does not have a warm local cache: it will move
|
||||
a lot of gigabytes, but that shouldn't break anything.
|
||||
- Generating truncated layers for delta that straddle the branch point will
|
||||
require streaming read/write of all the layers in question.
|
||||
|
||||
### Updating timeline ancestry
|
||||
|
||||
The simplest way to update timeline ancestry will probably be to stop and start
|
||||
all the Timeline objects: this is preferable to the complexity of making their
|
||||
ancestry mutable at runtime.
|
||||
|
||||
There will be a corresponding "stutter" in the availability of the timelines,
|
||||
of the order 10-100ms, which is the time taken to upload their IndexPart, and
|
||||
restart the Timeline.
|
||||
|
||||
# Interaction with other features
|
||||
|
||||
## Concurrent timeline creation
|
||||
|
||||
If new historic timelines are created using the parent as an ancestor while the
|
||||
detach ancestor API is running, they will not be re-parented to the child. This
|
||||
doesn't break anything, but it leaves the parent in a state where it might not
|
||||
be possible to delete it.
|
||||
|
||||
Since timeline creations are an explicit user action, this is not something we need to
|
||||
worry about as the storage layer: a user who wants to delete their parent timeline will not create
|
||||
new children, and if they do, they can choose to delete those children to
|
||||
enable deleting the parent.
|
||||
|
||||
For the least surprise to the user, before starting the detach ancestor branch
|
||||
operation, the control plane should wait until all branches are created and not
|
||||
allow any branches to be created before the branch point on the ancestor branch
|
||||
while the operation is ongoing.
|
||||
|
||||
## WAL based disaster recovery
|
||||
|
||||
WAL based disaster recovery currently supports only restoring of the main
|
||||
branch. Enabling WAL based disaster recovery in the future requires that we
|
||||
keep a record which timeline generated the WAL and at which LSN was a parent
|
||||
detached. Keep a list of timeline ids and the LSN in which they were detached in
|
||||
the `index_part.json`. Limit the size of the list to 100 first entries, after
|
||||
which the WAL disaster recovery will not be possible.
|
||||
|
||||
## Sharded tenants
|
||||
|
||||
For sharded tenants, calls to the detach ancestor API will pass through the storage
|
||||
controller, which will handle them the same as timeline creations: invoke first
|
||||
on shard zero, and then on all the other shards.
|
||||
507
docs/rfcs/034-timeline-archive.md
Normal file
507
docs/rfcs/034-timeline-archive.md
Normal file
@@ -0,0 +1,507 @@
|
||||
# Timeline Archival
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes a mechanism for pageservers to eliminate local storage + compute work
|
||||
for timelines which are not in use, in response to external API calls to "archive" a timeline.
|
||||
|
||||
The archived state roughly corresponds to fully offloading a timeline to object storage, such
|
||||
that its cost is purely the cost of that object storage.
|
||||
|
||||
## Motivation
|
||||
|
||||
Archived timelines serve multiple purposes:
|
||||
- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
|
||||
database from longer ago than their PITR window.
|
||||
- Enable users to create huge numbers of branches (e.g. one per github PR) without having
|
||||
to diligently clean them up later to avoid overloading the pageserver (currently we support
|
||||
up to ~500 branches per tenant).
|
||||
|
||||
### Prior art
|
||||
|
||||
Most storage and database systems have some form of snapshot, which can be implemented several ways:
|
||||
1. full copies of data (e.g. an EBS snapshot to S3)
|
||||
2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
|
||||
3. a series of snapshots which are CoW or de-duplicated relative to one another.
|
||||
|
||||
Today's Neon branches are approximately like `2.`, although due to implementation details branches
|
||||
often end up storing much more data than they really need, as parent branches assume that all data
|
||||
at the branch point is needed. The layers pinned in the parent branch may have a much larger size
|
||||
than the physical size of a compressed image layer representing the data at the branch point.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Enter & exit the archived state in response to external admin API calls
|
||||
- API calls to modify the archived state are atomic and durable
|
||||
- An archived timeline should eventually (once out of PITR window) use an efficient compressed
|
||||
representation, and avoid retaining arbitrarily large data in its parent branch.
|
||||
- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
|
||||
but must not scale with the number of _archived_ branches.
|
||||
- Background I/O for archived branches should only be done a limited number of times to evolve them
|
||||
to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping"
|
||||
overhead for archived branches, including operations related to calculating sizes for billing.
|
||||
- The pageserver should put no load on the safekeeper for archived branches.
|
||||
- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
|
||||
to a performant state in a short time (linear with the branch's logical size)
|
||||
|
||||
## Non Goals
|
||||
|
||||
- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
|
||||
in Neon's internal format.
|
||||
- Compute cold starts after activating an archived branch will not have comparable performance to
|
||||
cold starts on an active branch.
|
||||
- Archived branches will not use any new/additional compression or de-duplication beyond what
|
||||
is already implemented for image layers (zstd per page).
|
||||
- The pageserver will not "auto start" archived branches in response to page_service API requests: they
|
||||
are only activated explicitly via the HTTP API.
|
||||
- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
|
||||
remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
|
||||
- We will not expose any prometheus metrics for archived timelines, or make them visible in any
|
||||
detailed HTTP APIs other than the specific API for listing archived timelines.
|
||||
- A parent branch may not be archived unless all its children are.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver, storage controller
|
||||
|
||||
## Terminology
|
||||
|
||||
**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
|
||||
may assume that this branch is now very cheap to store, although this may not be physically so until the
|
||||
branch proceeds to the offloaded state.
|
||||
|
||||
**Active** branches are branches which are available for use by page_service clients, and have a relatively
|
||||
high cost due to consuming local storage.
|
||||
|
||||
**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
|
||||
that they now consume minimal runtime resources and have a cost similar to the cost of object storage.
|
||||
|
||||
**Activate** (verb): transition from Archived to Active
|
||||
|
||||
**Archive** (verb): transition from Active to Archived
|
||||
|
||||
**Offload** (verb): transition from Archived to Offloaded
|
||||
|
||||
**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.
|
||||
|
||||
**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is
|
||||
warmed up, good performance will be available to page_service clients.
|
||||
|
||||
## Implementation
|
||||
|
||||
### High level flow
|
||||
|
||||
We may think of a timeline which is archived and then activated as proceeding through a series of states:
|
||||
|
||||
```mermaid
|
||||
stateDiagram
|
||||
[*] --> Active(warm)
|
||||
Active(warm) --> Archived
|
||||
Archived --> Offloaded
|
||||
Archived --> Active(warm)
|
||||
Offloaded --> Active(cold)
|
||||
Active(cold) --> Active(warm)
|
||||
```
|
||||
|
||||
Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
|
||||
of branches will be:
|
||||
- Very frequent: Short lived branches: Active -> Deleted
|
||||
- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
|
||||
- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active
|
||||
|
||||
These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
|
||||
of:
|
||||
- the timeline's lifecycle state: active or archived, stored in the timeline's index
|
||||
- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the
|
||||
manifest of offloaded timelines.
|
||||
- cache state (whether it's warm or cold).
|
||||
|
||||
### Storage format changes
|
||||
|
||||
There are two storage format changes:
|
||||
1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
|
||||
be considered active or archived.
|
||||
2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
|
||||
at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)
|
||||
|
||||
The manifest object will have a format like this:
|
||||
```
|
||||
{
|
||||
"offload_timelines": [
|
||||
{
|
||||
"timeline_id": ...
|
||||
"last_record_lsn": ...
|
||||
"last_record_lsn_time": ...
|
||||
"pitr_interval": ...
|
||||
"last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
|
||||
"logical_size": ... # The size at last_record_lsn
|
||||
"physical_size" ...
|
||||
"parent": Option<{
|
||||
"timeline_id"...
|
||||
"lsn"... # Branch point LSN on the parent
|
||||
"requires_data": bool # True if this branch depends on layers in its parent, identify it here
|
||||
|
||||
}>
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The information about a timeline in its offload state is intentionally minimal: just enough to decide:
|
||||
- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this
|
||||
by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn.
|
||||
- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
|
||||
layers that the archived branch depends on
|
||||
- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
|
||||
is received for a timeline_id that isn't in the site of live `Timelines` or in the manifest, then
|
||||
we don't need to go to S3 for the deletion.
|
||||
- How much archived space to report in consumption metrics
|
||||
|
||||
The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
|
||||
set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
|
||||
(offloaded timelines).
|
||||
|
||||
For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
|
||||
index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but
|
||||
give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code
|
||||
for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
|
||||
the manifest file.
|
||||
|
||||
### API & Timeline state
|
||||
|
||||
Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will
|
||||
be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which
|
||||
may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
|
||||
a per-timeline configuration).
|
||||
|
||||
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
|
||||
```
|
||||
{
|
||||
'state': 'active|archive'
|
||||
}
|
||||
```
|
||||
|
||||
When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
|
||||
|
||||
When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
|
||||
**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's
|
||||
index, but not any data: it should be about as fast as a couple of small S3 requests.
|
||||
|
||||
The API will be available with identical path via the storage controller: calling this on a sharded tenant
|
||||
will simply map the API call to all the shards.
|
||||
|
||||
Archived timelines may never have descendent timelines which are active. This will be enforced at the API level,
|
||||
such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
|
||||
that all its descendents are archived. It is the callers responsibility to walk the hierarchy of timelines
|
||||
in the proper order if they would like to archive whole trees of branches.
|
||||
|
||||
Because archive timelines will be excluded from the usual timeline listing APIs, a new API specifically
|
||||
for archived timelines will be added: this is for use in support/debug:
|
||||
|
||||
```
|
||||
GET /v1/tenants/{tenant_id}/archived_timelines
|
||||
|
||||
{
|
||||
...same per-timeline content as the tenant manifest...
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
### Tenant attach changes
|
||||
|
||||
Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
|
||||
we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived
|
||||
timelines, we must have a single object that tells us which timelines do not need to be loaded. The
|
||||
number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
|
||||
because each request covers 1000 timelines.
|
||||
|
||||
This is **not** literally the same as the set of timelines who have state=archived. Rather, it is
|
||||
the set of timelines which have been offloaded in the background after their state was set to archived.
|
||||
|
||||
We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
|
||||
exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need
|
||||
to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
|
||||
to delete an offloaded timeline.
|
||||
|
||||
### Warm-up API
|
||||
|
||||
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
|
||||
|
||||
This API will be similar to the existing `download_remote_layers` API, but smarter:
|
||||
- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
|
||||
- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
|
||||
of downloads, so that the caller can poll.
|
||||
|
||||
The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
|
||||
of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
|
||||
can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache
|
||||
eviction and heatmaps, as well as in this specific case of warming up a timeline.
|
||||
|
||||
The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised
|
||||
to call it, because otherwise populating local contents for a timeline can take a long time when waiting
|
||||
for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
|
||||
volatile.
|
||||
|
||||
### Background work
|
||||
|
||||
Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters
|
||||
an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
|
||||
([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk
|
||||
if its state permits that.
|
||||
|
||||
Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
|
||||
optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
|
||||
has elapsed and it can now be rewritten to image layers.
|
||||
|
||||
#### Archive branch offload
|
||||
|
||||
Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
|
||||
any actual work.
|
||||
|
||||
This work is done in the background compaction loop. It makes sense to tag this work on to the compaction
|
||||
loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
|
||||
|
||||
The condition for offload is simple:
|
||||
- a `Timeline` object exists with state `Archived`
|
||||
- the timeline does not have any non-offloaded children.
|
||||
|
||||
Regarding the condition that children must be offloaded, this will always be eventually true, because
|
||||
we enforce at the API level that children of archived timelines must themselves be archived, and all
|
||||
archived timelines will eventually be offloaded.
|
||||
|
||||
Offloading a timeline is simple:
|
||||
- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
|
||||
- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
|
||||
- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
|
||||
- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
|
||||
|
||||
#### Archive branch optimization (flattening)
|
||||
|
||||
When we offloaded a branch, it might have had some history that prevented rewriting it to a single
|
||||
point in time set of image layers. For example, a branch might have several days of writes and a 7
|
||||
day PITR: when we archive it, it still has those days of history.
|
||||
|
||||
Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
|
||||
- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
|
||||
a point in time compared with delta layers
|
||||
- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
|
||||
for data, i.e. the ancestor is free to GC layers files at+below the branch point
|
||||
|
||||
Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
|
||||
branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
|
||||
a true snapshot at that LSN.
|
||||
|
||||
It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
|
||||
is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper)
|
||||
|
||||
Archive branch optimization should be done _before_ background offloads during compaction, because there may
|
||||
be timelines which are ready to be offloaded but also would benefit from the optimization step before
|
||||
being offloaded. For example, a branch which has already fallen out of PITR window and has no history
|
||||
of its own may be immediately re-written as a series of image layers before being offloaded.
|
||||
|
||||
### Consumption metrics
|
||||
|
||||
Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipating
|
||||
that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
|
||||
vs. ordinary content.
|
||||
|
||||
Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
|
||||
variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
|
||||
|
||||
### Secondary locations
|
||||
|
||||
Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby
|
||||
when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents
|
||||
will be dropped from secondary locations.
|
||||
|
||||
### Sharding
|
||||
|
||||
Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
|
||||
the same way that timeline creation and deletion is done. There are no special rules about ordering:
|
||||
the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.
|
||||
|
||||
Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
|
||||
will be authoritative for consumption metrics.
|
||||
|
||||
## Error cases
|
||||
|
||||
### Errors in sharded tenants
|
||||
|
||||
If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
|
||||
state, where a timeline is archived on some shards but not on others.
|
||||
|
||||
We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
|
||||
are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
|
||||
In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
|
||||
up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent
|
||||
state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't
|
||||
break anything, it's just "weird".
|
||||
|
||||
This is similar to the status quo for timeline creation and deletion: callers are expected to retry
|
||||
these operations until they succeed.
|
||||
|
||||
### Archiving/activating
|
||||
|
||||
Archiving/activating a timeline can fail in a limited number of ways:
|
||||
1. I/O error storing/reading the timeline's updated index
|
||||
- These errors are always retryable: a fundamental design assumption of the pageserver is that remote
|
||||
storage errors are always transient.
|
||||
2. NotFound if the timeline doesn't exist
|
||||
- Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
|
||||
- The storage controller has runtime locking to prevent races such as deleting a timeline while
|
||||
archiving it.
|
||||
3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated
|
||||
- Callers are expected to do their own checks to avoid hitting this case. If they make
|
||||
a mistake and encounter this error, they should give up.
|
||||
|
||||
### Offloading
|
||||
|
||||
Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
|
||||
tenant manifest. In such error cases, we give up in the expectation that offloading will be tried
|
||||
again at the next iteration of the compaction loop.
|
||||
|
||||
### Archive branch optimization
|
||||
|
||||
Optimization is a special form of compaction, so can encounter all the same errors as regular compaction
|
||||
can: it should return Result<(), CompactionError>, and as with compaction it will be retried on
|
||||
the next iteration of the compaction loop.
|
||||
|
||||
## Optimizations
|
||||
|
||||
### Delaying storage optimization if retaining parent layers is cheaper
|
||||
|
||||
Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
|
||||
is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
|
||||
are offloaded to S3 they're totally safe, inert things.
|
||||
|
||||
However, in some cases it can be advantageous to retain extra history on their parent branch rather
|
||||
than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB
|
||||
of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
|
||||
for each nightly branch is inefficient compared with just keeping more history on the main branch.
|
||||
|
||||
Getting this right requires consideration of:
|
||||
- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
|
||||
write out extra image layers, then it might make more sense to just write out the image layers on
|
||||
the archived branch.
|
||||
- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
|
||||
the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely
|
||||
large layer map can cause problems elsewhere.
|
||||
|
||||
This optimization can probably be implemented quite cheaply with some basic heuristics like:
|
||||
- don't bother doing optimization on an archive branch if the LSN distance between
|
||||
its branch point and the end of the PITR window is <5% of the logical size of the archive branch.
|
||||
- ...but, Don't keep more history on the main branch than double the PITR
|
||||
|
||||
### Creating a timeline in archived state (a snapshot)
|
||||
|
||||
Sometimes, one might want to create a branch with no history, which will not be written to
|
||||
before it is archived. This is a snapshot, although we do not require a special snapshot API,
|
||||
since a snapshot can be represented as a timeline with no history.
|
||||
|
||||
This can be accomplished by simply creating a timeline and then immediately archiving it, but
|
||||
that is somewhat wasteful: this timeline it will spin up various tasks and open a connection to the storage
|
||||
broker to try and ingest WAL, before being shutdown in the subsequent archival call. To explicitly
|
||||
support this common special case, we may add a parameter to the timeline creation API which
|
||||
creates a timeline directly into the archived state.
|
||||
|
||||
Such a timeline creation will do exactly two I/Os at creation time:
|
||||
- write the index_part object to record the timeline's existence
|
||||
- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
|
||||
write the tenant manifest.
|
||||
|
||||
Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
|
||||
up the 'snapshot' branch and write out image layers.
|
||||
|
||||
## Future Work
|
||||
|
||||
### Enabling `fullbackup` dumps from archive branches
|
||||
|
||||
It would be useful to be able to export an archive branch to another system, or for use in a local
|
||||
postgres database.
|
||||
|
||||
This could be implemented as a general capability for all branches, in which case it would "just work"
|
||||
for archive branches by activating them. However, downloading all the layers in a branch just to generate
|
||||
a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
|
||||
which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
|
||||
|
||||
Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
|
||||
is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup
|
||||
stream to S3 in an intermediate format and, then having one node stitch them together).
|
||||
|
||||
### Tagging layers from archived branches
|
||||
|
||||
When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
|
||||
we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
|
||||
cheaper storage.
|
||||
|
||||
This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
|
||||
external hints on which branches are likely to be reactivated, and which branches are good candidates for
|
||||
tagging for low performance storage.
|
||||
|
||||
Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object
|
||||
stores have similar mechanisms.
|
||||
|
||||
### Storing sequences of archive branches as deltas
|
||||
|
||||
When archived branches are used as scheduled snapshots, we could store them even more efficiently
|
||||
by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
|
||||
storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
|
||||
pages). This is the kind of encoding that many backup storage systems use.
|
||||
|
||||
The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
|
||||
vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full
|
||||
copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
|
||||
so the complexity tradeoff of diff-encoding it is dubious).
|
||||
|
||||
One does not necessarily have to read-back the previous snapshot in order to encoded the next one: if the
|
||||
pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
|
||||
we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch,
|
||||
so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
|
||||
delta snapshot".
|
||||
|
||||
Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
|
||||
each other: perhaps this would be done by making the archive branches have child/parent relationships with
|
||||
each other, or perhaps we would permit them to remain children of their original parent, but additionally
|
||||
have a relationship with the snapshot they're encoded relative to.
|
||||
|
||||
Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
|
||||
out how frequently to write a full copy is important. This is essentially a zoomed-out version of what
|
||||
we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
|
||||
|
||||
|
||||
## FAQ/Alternatives
|
||||
|
||||
### Store all timelines in the tenant manifest
|
||||
|
||||
Rather than special-casing offloaded timelines in the offload manifest, we could store a total
|
||||
manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
|
||||
startup.
|
||||
|
||||
That would be a more invasive change (require hooking in to timeline creation), and would
|
||||
generate much more I/O to this manifest for tenants that had many branches _and_ frequent
|
||||
create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines
|
||||
means that we only have to cope with the rate at which long-lived timelines are archived, rather
|
||||
than the rate at which sort lived timelines are created & destroyed.
|
||||
|
||||
### Automatically archiving/activating timelines without external API calls
|
||||
|
||||
We could implement TTL driven offload of timelines, waking them up when a page request
|
||||
arrives.
|
||||
|
||||
This has downsides:
|
||||
- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
|
||||
know which of their branches are in this state, and might get a surprise when they try
|
||||
to use such a branch.
|
||||
- Price fluctuation: if the archival of a branch is used in end user pricing, then users
|
||||
prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it
|
||||
is created, rather than having a usage-dependency storage price.
|
||||
- Complexity: enabling the page service to call up into the Tenant to activate a timeline
|
||||
would be awkward, compared with an external entry point.
|
||||
|
||||
### Make offloaded a state of Timeline
|
||||
|
||||
To reduce the operator-facing complexity of having some timelines APIs that only return
|
||||
non-offloaded timelines, we could build the offloaded state into the Timeline type.
|
||||
|
||||
`timeline.rs` is already one of the most egregiously long source files in the tree, so
|
||||
this is rejected on the basis that we need to avoid making that complexity worse.
|
||||
@@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration:
|
||||
- Use `diesel migration generate <name>` to create a new migration
|
||||
- Populate the SQL files in the `migrations/` subdirectory
|
||||
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
|
||||
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
|
||||
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
|
||||
- Commit the migration files and the changes to schema.rs
|
||||
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
|
||||
- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
|
||||
|
||||
@@ -13,11 +13,7 @@ use std::{
|
||||
|
||||
use measured::{
|
||||
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
||||
metric::{
|
||||
group::{Encoding, MetricValue},
|
||||
name::MetricNameEncoder,
|
||||
Metric, MetricType, MetricVec,
|
||||
},
|
||||
metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
|
||||
text::TextEncoder,
|
||||
LabelGroup,
|
||||
};
|
||||
@@ -144,6 +140,7 @@ impl<const N: usize> HyperLogLogState<N> {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
||||
for HyperLogLogState<N>
|
||||
{
|
||||
@@ -182,12 +179,13 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.try_for_each(|(hll_shard, val)| {
|
||||
enc.write_metric_value(
|
||||
name.by_ref(),
|
||||
CounterState::new(val as u64).collect_into(
|
||||
&(),
|
||||
labels.by_ref().compose_with(HllShardLabel {
|
||||
hll_shard: hll_shard as i64,
|
||||
}),
|
||||
MetricValue::Int(val as i64),
|
||||
name.by_ref(),
|
||||
enc,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ use measured::{
|
||||
metric::{
|
||||
counter::CounterState,
|
||||
gauge::GaugeState,
|
||||
group::{Encoding, MetricValue},
|
||||
group::Encoding,
|
||||
name::{MetricName, MetricNameEncoder},
|
||||
MetricEncoding, MetricFamilyEncoding,
|
||||
},
|
||||
@@ -171,8 +171,11 @@ fn write_gauge<Enc: Encoding>(
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut Enc,
|
||||
) -> Result<(), Enc::Err> {
|
||||
enc.write_metric_value(name, labels, MetricValue::Int(x))
|
||||
) -> Result<(), Enc::Err>
|
||||
where
|
||||
GaugeState: MetricEncoding<Enc>,
|
||||
{
|
||||
GaugeState::new(x).collect_into(&(), labels, name, enc)
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -544,15 +547,6 @@ impl<T: Encoding> Encoding for Inc<T> {
|
||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||
self.0.write_help(name, help)
|
||||
}
|
||||
|
||||
fn write_metric_value(
|
||||
&mut self,
|
||||
name: impl MetricNameEncoder,
|
||||
labels: impl LabelGroup,
|
||||
value: MetricValue,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.0.write_metric_value(name, labels, value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
||||
@@ -579,15 +573,6 @@ impl<T: Encoding> Encoding for Dec<T> {
|
||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||
self.0.write_help(name, help)
|
||||
}
|
||||
|
||||
fn write_metric_value(
|
||||
&mut self,
|
||||
name: impl MetricNameEncoder,
|
||||
labels: impl LabelGroup,
|
||||
value: MetricValue,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.0.write_metric_value(name, labels, value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the dec counter to the encoder
|
||||
|
||||
@@ -87,7 +87,7 @@ pub struct TenantLocateResponse {
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantDescribeResponse {
|
||||
pub tenant_id: TenantId,
|
||||
pub shards: Vec<TenantDescribeResponseShard>,
|
||||
@@ -110,7 +110,7 @@ pub struct NodeDescribeResponse {
|
||||
pub listen_pg_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantDescribeResponseShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
|
||||
|
||||
@@ -294,7 +294,6 @@ pub struct TenantConfig {
|
||||
pub walreceiver_connect_timeout: Option<String>,
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||
@@ -440,9 +439,6 @@ pub enum CompactionAlgorithm {
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum ImageCompressionAlgorithm {
|
||||
/// Disabled for writes, and never decompress during reading.
|
||||
/// Never set this after you've enabled compression once!
|
||||
DisabledNoDecompress,
|
||||
// Disabled for writes, support decompressing during read path
|
||||
Disabled,
|
||||
/// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
|
||||
@@ -452,12 +448,6 @@ pub enum ImageCompressionAlgorithm {
|
||||
},
|
||||
}
|
||||
|
||||
impl ImageCompressionAlgorithm {
|
||||
pub fn allow_decompression(&self) -> bool {
|
||||
!matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress)
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for ImageCompressionAlgorithm {
|
||||
type Err = anyhow::Error;
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
@@ -466,7 +456,6 @@ impl FromStr for ImageCompressionAlgorithm {
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("empty string"))?;
|
||||
match first {
|
||||
"disabled-no-decompress" => Ok(ImageCompressionAlgorithm::DisabledNoDecompress),
|
||||
"disabled" => Ok(ImageCompressionAlgorithm::Disabled),
|
||||
"zstd" => {
|
||||
let level = if let Some(v) = components.next() {
|
||||
@@ -662,6 +651,17 @@ pub struct TenantDetails {
|
||||
pub timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
|
||||
pub enum TimelineArchivalState {
|
||||
Archived,
|
||||
Unarchived,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
pub struct TimelineArchivalConfigRequest {
|
||||
pub state: TimelineArchivalState,
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
@@ -1683,10 +1683,6 @@ mod tests {
|
||||
ImageCompressionAlgorithm::from_str("disabled").unwrap(),
|
||||
Disabled
|
||||
);
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("disabled-no-decompress").unwrap(),
|
||||
DisabledNoDecompress
|
||||
);
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("zstd").unwrap(),
|
||||
Zstd { level: None }
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use utils::id::TimelineId;
|
||||
|
||||
#[derive(Default, serde::Serialize)]
|
||||
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct AncestorDetached {
|
||||
pub reparented_timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
@@ -1,59 +1,42 @@
|
||||
use std::{ops::RangeInclusive, str::FromStr};
|
||||
//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||
//!
|
||||
//! This module contains a variety of types used to represent the concept of sharding
|
||||
//! a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||
//! we provide an summary here.
|
||||
//!
|
||||
//! Types used to describe shards:
|
||||
//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||
//! which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||
//! a shard suffix.
|
||||
//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||
//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||
//! without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||
//! tenant, such as layer files.
|
||||
//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
||||
//! detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||
//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||
//! four hex digits. An unsharded tenant is `0000`.
|
||||
//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||
//!
|
||||
//! Types used to describe the parameters for data distribution in a sharded tenant:
|
||||
//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||
//! multiple shards. Its value is given in 8kiB pages.
|
||||
//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||
//! always zero: this is provided for future upgrades that might introduce different
|
||||
//! data distribution schemes.
|
||||
//!
|
||||
//! Examples:
|
||||
//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||
//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
//! and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
use crate::{key::Key, models::ShardParameters};
|
||||
use hex::FromHex;
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||
///
|
||||
/// This module contains a variety of types used to represent the concept of sharding
|
||||
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||
/// we provide an summary here.
|
||||
///
|
||||
/// Types used to describe shards:
|
||||
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||
/// a shard suffix.
|
||||
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||
/// tenant, such as layer files.
|
||||
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
||||
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||
/// four hex digits. An unsharded tenant is `0000`.
|
||||
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||
///
|
||||
/// Types used to describe the parameters for data distribution in a sharded tenant:
|
||||
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||
/// multiple shards. Its value is given in 8kiB pages.
|
||||
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||
/// always zero: this is provided for future upgrades that might introduce different
|
||||
/// data distribution schemes.
|
||||
///
|
||||
/// Examples:
|
||||
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
/// and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(u8);
|
||||
|
||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||
/// the fully qualified TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
#[doc(inline)]
|
||||
pub use ::utils::shard::*;
|
||||
|
||||
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
|
||||
/// and to check whether that [`ShardNumber`] is the same as the current shard.
|
||||
@@ -65,362 +48,6 @@ pub struct ShardIdentity {
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
/// Formatting helper, for generating the `shard_id` label in traces.
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
||||
///
|
||||
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
||||
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
||||
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
|
||||
///
|
||||
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as [`TenantShardId::unsharded`].
|
||||
///
|
||||
/// This method returns the actual number of shards, i.e. if our internal value is
|
||||
/// zero, we return 1 (unsharded tenants have 1 shard).
|
||||
pub fn count(&self) -> u8 {
|
||||
if self.0 > 0 {
|
||||
self.0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
|
||||
/// The literal internal value: this is **not** the number of shards in the
|
||||
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
|
||||
/// [`Self::count`] if you want to know the cardinality of shards.
|
||||
pub fn literal(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
|
||||
/// uses the legacy format for `TenantShardId`. See also the documentation for
|
||||
/// [`Self::count`].
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
|
||||
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
||||
/// [`Self::literal`] would return.
|
||||
pub const fn new(val: u8) -> Self {
|
||||
Self(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardNumber {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
}
|
||||
|
||||
impl TenantShardId {
|
||||
pub fn unsharded(tenant_id: TenantId) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
|
||||
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber::MAX,
|
||||
shard_count: ShardCount::MAX,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
|
||||
ShardSlug(self)
|
||||
}
|
||||
|
||||
/// Convenience for code that has special behavior on the 0th shard.
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
}
|
||||
|
||||
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
|
||||
/// is useful when logging from code that is already in a span that includes tenant ID, to
|
||||
/// keep messages reasonably terse.
|
||||
pub fn to_index(&self) -> ShardIndex {
|
||||
ShardIndex {
|
||||
shard_number: self.shard_number,
|
||||
shard_count: self.shard_count,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate the children of this TenantShardId when splitting the overall tenant into
|
||||
/// the given number of shards.
|
||||
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
|
||||
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
|
||||
let mut child_shards = Vec::new();
|
||||
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
|
||||
// Key mapping is based on a round robin mapping of key hash modulo shard count,
|
||||
// so our child shards are the ones which the same keys would map to.
|
||||
if shard_number % effective_old_shard_count == self.shard_number.0 {
|
||||
child_shards.push(TenantShardId {
|
||||
tenant_id: self.tenant_id,
|
||||
shard_number: ShardNumber(shard_number),
|
||||
shard_count: new_shard_count,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
child_shards
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{:02x}{:02x}",
|
||||
self.0.shard_number.0, self.0.shard_count.0
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TenantShardId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.shard_count != ShardCount(0) {
|
||||
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
|
||||
} else {
|
||||
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
||||
// is distinct from the normal single shard case (shard count == 1).
|
||||
self.tenant_id.fmt(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for TenantShardId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// Debug is the same as Display: the compact hex representation
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for TenantShardId {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
|
||||
if s.len() == 32 {
|
||||
// Legacy case: no shard specified
|
||||
Ok(Self {
|
||||
tenant_id: TenantId::from_str(s)?,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
})
|
||||
} else if s.len() == 37 {
|
||||
let bytes = s.as_bytes();
|
||||
let tenant_id = TenantId::from_hex(&bytes[0..32])?;
|
||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
||||
hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(shard_parts[0]),
|
||||
shard_count: ShardCount(shard_parts[1]),
|
||||
})
|
||||
} else {
|
||||
Err(hex::FromHexError::InvalidStringLength)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 18]> for TenantShardId {
|
||||
fn from(b: [u8; 18]) -> Self {
|
||||
let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
|
||||
|
||||
Self {
|
||||
tenant_id: TenantId::from(tenant_id_bytes),
|
||||
shard_number: ShardNumber(b[16]),
|
||||
shard_count: ShardCount(b[17]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
shard_number: number,
|
||||
shard_count: count,
|
||||
}
|
||||
}
|
||||
pub fn unsharded() -> Self {
|
||||
Self {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
|
||||
/// For use in constructing remote storage paths: concatenate this with a TenantId
|
||||
/// to get a fully qualified TenantShardId.
|
||||
///
|
||||
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
|
||||
/// that the legacy pre-sharding remote key format is preserved.
|
||||
pub fn get_suffix(&self) -> String {
|
||||
if self.is_unsharded() {
|
||||
"".to_string()
|
||||
} else {
|
||||
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ShardIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// Debug is the same as Display: the compact hex representation
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for ShardIndex {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// Expect format: 1 byte shard number, 1 byte shard count
|
||||
if s.len() == 4 {
|
||||
let bytes = s.as_bytes();
|
||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
||||
hex::decode_to_slice(bytes, &mut shard_parts)?;
|
||||
Ok(Self {
|
||||
shard_number: ShardNumber(shard_parts[0]),
|
||||
shard_count: ShardCount(shard_parts[1]),
|
||||
})
|
||||
} else {
|
||||
Err(hex::FromHexError::InvalidStringLength)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 2]> for ShardIndex {
|
||||
fn from(b: [u8; 2]) -> Self {
|
||||
Self {
|
||||
shard_number: ShardNumber(b[0]),
|
||||
shard_count: ShardCount(b[1]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for TenantShardId {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Note: while human encoding of [`TenantShardId`] is backward and forward
|
||||
// compatible, this binary encoding is not.
|
||||
let mut packed: [u8; 18] = [0; 18];
|
||||
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
||||
packed[16] = self.shard_number.0;
|
||||
packed[17] = self.shard_count.0;
|
||||
|
||||
packed.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TenantShardId {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
||||
type Value = TenantShardId;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 18])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 18] = Deserialize::deserialize(s)?;
|
||||
Ok(TenantShardId::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
TenantShardId::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
18,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Stripe size in number of pages
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardStripeSize(pub u32);
|
||||
@@ -585,77 +212,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for ShardIndex {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Binary encoding is not used in index_part.json, but is included in anticipation of
|
||||
// switching various structures (e.g. inter-process communication, remote metadata) to more
|
||||
// compact binary encodings in future.
|
||||
let mut packed: [u8; 2] = [0; 2];
|
||||
packed[0] = self.shard_number.0;
|
||||
packed[1] = self.shard_count.0;
|
||||
packed.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for ShardIndex {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
||||
type Value = ShardIndex;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 2])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 2] = Deserialize::deserialize(s)?;
|
||||
Ok(ShardIndex::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
ShardIndex::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
2,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
|
||||
/// in order to be able to serve basebackup requests without peer communication).
|
||||
fn key_is_shard0(key: &Key) -> bool {
|
||||
@@ -737,7 +293,9 @@ pub fn describe(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::Hex;
|
||||
use std::str::FromStr;
|
||||
|
||||
use utils::{id::TenantId, Hex};
|
||||
|
||||
use super::*;
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ rustls.workspace = true
|
||||
serde.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -23,4 +24,4 @@ workspace_hack.workspace = true
|
||||
once_cell.workspace = true
|
||||
rustls-pemfile.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
|
||||
@@ -16,6 +16,7 @@ use std::{fmt, io};
|
||||
use std::{future::Future, str::FromStr};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
|
||||
@@ -400,21 +401,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
}
|
||||
|
||||
/// Wrapper for run_message_loop() that shuts down socket when we are done
|
||||
pub async fn run<F, S>(
|
||||
pub async fn run(
|
||||
mut self,
|
||||
handler: &mut impl Handler<IO>,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S + Clone,
|
||||
S: Future,
|
||||
{
|
||||
let ret = self
|
||||
.run_message_loop(handler, shutdown_watcher.clone())
|
||||
.await;
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), QueryError> {
|
||||
let ret = self.run_message_loop(handler, cancel).await;
|
||||
|
||||
tokio::select! {
|
||||
_ = shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
// do nothing; we most likely got already stopped by shutdown and will log it next.
|
||||
}
|
||||
_ = self.framed.shutdown() => {
|
||||
@@ -444,21 +439,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_message_loop<F, S>(
|
||||
async fn run_message_loop(
|
||||
&mut self,
|
||||
handler: &mut impl Handler<IO>,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
{
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), QueryError> {
|
||||
trace!("postgres backend to {:?} started", self.peer_addr);
|
||||
|
||||
tokio::select!(
|
||||
biased;
|
||||
|
||||
_ = shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during handshake");
|
||||
return Err(QueryError::Shutdown)
|
||||
@@ -473,7 +464,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
let mut query_string = Bytes::new();
|
||||
while let Some(msg) = tokio::select!(
|
||||
biased;
|
||||
_ = shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received in run_message_loop");
|
||||
return Err(QueryError::Shutdown)
|
||||
@@ -485,7 +476,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
let result = self.process_message(handler, msg, &mut query_string).await;
|
||||
tokio::select!(
|
||||
biased;
|
||||
_ = shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during response flush");
|
||||
|
||||
@@ -672,11 +663,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
assert!(self.state < ProtoState::Authentication);
|
||||
let have_tls = self.tls_config.is_some();
|
||||
match msg {
|
||||
FeStartupPacket::SslRequest => {
|
||||
FeStartupPacket::SslRequest { direct } => {
|
||||
debug!("SSL requested");
|
||||
|
||||
self.write_message(&BeMessage::EncryptionResponse(have_tls))
|
||||
.await?;
|
||||
if !direct {
|
||||
self.write_message(&BeMessage::EncryptionResponse(have_tls))
|
||||
.await?;
|
||||
} else if !have_tls {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"direct SSL negotiation but no TLS support"
|
||||
)));
|
||||
}
|
||||
|
||||
if have_tls {
|
||||
self.start_tls().await?;
|
||||
|
||||
@@ -3,13 +3,14 @@ use once_cell::sync::Lazy;
|
||||
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
|
||||
use pq_proto::{BeMessage, RowDescriptor};
|
||||
use std::io::Cursor;
|
||||
use std::{future, sync::Arc};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tokio_postgres::tls::MakeTlsConnect;
|
||||
use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
|
||||
use tokio_postgres_rustls::MakeRustlsConnect;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
// generate client, server test streams
|
||||
async fn make_tcp_pair() -> (TcpStream, TcpStream) {
|
||||
@@ -50,7 +51,7 @@ async fn simple_select() {
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut handler = TestHandler {};
|
||||
pgbackend.run(&mut handler, future::pending::<()>).await
|
||||
pgbackend.run(&mut handler, &CancellationToken::new()).await
|
||||
});
|
||||
|
||||
let conf = Config::new();
|
||||
@@ -102,7 +103,7 @@ async fn simple_select_ssl() {
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut handler = TestHandler {};
|
||||
pgbackend.run(&mut handler, future::pending::<()>).await
|
||||
pgbackend.run(&mut handler, &CancellationToken::new()).await
|
||||
});
|
||||
|
||||
let client_cfg = rustls::ClientConfig::builder()
|
||||
|
||||
@@ -44,9 +44,9 @@ impl ConnectionError {
|
||||
/// Wraps async io `stream`, providing messages to write/flush + read Postgres
|
||||
/// messages.
|
||||
pub struct Framed<S> {
|
||||
stream: S,
|
||||
read_buf: BytesMut,
|
||||
write_buf: BytesMut,
|
||||
pub stream: S,
|
||||
pub read_buf: BytesMut,
|
||||
pub write_buf: BytesMut,
|
||||
}
|
||||
|
||||
impl<S> Framed<S> {
|
||||
|
||||
@@ -39,14 +39,39 @@ pub enum FeMessage {
|
||||
PasswordMessage(Bytes),
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, PartialOrd)]
|
||||
pub struct ProtocolVersion(u32);
|
||||
|
||||
impl ProtocolVersion {
|
||||
pub const fn new(major: u16, minor: u16) -> Self {
|
||||
Self((major as u32) << 16 | minor as u32)
|
||||
}
|
||||
pub const fn minor(self) -> u16 {
|
||||
self.0 as u16
|
||||
}
|
||||
pub const fn major(self) -> u16 {
|
||||
(self.0 >> 16) as u16
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ProtocolVersion {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_list()
|
||||
.entry(&self.major())
|
||||
.entry(&self.minor())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum FeStartupPacket {
|
||||
CancelRequest(CancelKeyData),
|
||||
SslRequest,
|
||||
SslRequest {
|
||||
direct: bool,
|
||||
},
|
||||
GssEncRequest,
|
||||
StartupMessage {
|
||||
major_version: u32,
|
||||
minor_version: u32,
|
||||
version: ProtocolVersion,
|
||||
params: StartupMessageParams,
|
||||
},
|
||||
}
|
||||
@@ -301,11 +326,23 @@ impl FeStartupPacket {
|
||||
/// different from [`FeMessage::parse`] because startup messages don't have
|
||||
/// message type byte; otherwise, its comments apply.
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
|
||||
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
|
||||
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
|
||||
const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
|
||||
const CANCEL_REQUEST_CODE: u32 = 5678;
|
||||
const NEGOTIATE_SSL_CODE: u32 = 5679;
|
||||
const NEGOTIATE_GSS_CODE: u32 = 5680;
|
||||
const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
|
||||
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
|
||||
const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
|
||||
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
|
||||
const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
|
||||
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
|
||||
const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
|
||||
|
||||
// <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
|
||||
// First byte indicates standard SSL handshake message
|
||||
// (It can't be a Postgres startup length because in network byte order
|
||||
// that would be a startup packet hundreds of megabytes long)
|
||||
if buf.first() == Some(&0x16) {
|
||||
return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
|
||||
}
|
||||
|
||||
// need at least 4 bytes with packet len
|
||||
if buf.len() < 4 {
|
||||
@@ -338,12 +375,10 @@ impl FeStartupPacket {
|
||||
let mut msg = buf.split_to(len).freeze();
|
||||
msg.advance(4); // consume len
|
||||
|
||||
let request_code = msg.get_u32();
|
||||
let req_hi = request_code >> 16;
|
||||
let req_lo = request_code & ((1 << 16) - 1);
|
||||
let request_code = ProtocolVersion(msg.get_u32());
|
||||
// StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
|
||||
let message = match (req_hi, req_lo) {
|
||||
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
|
||||
let message = match request_code {
|
||||
CANCEL_REQUEST_CODE => {
|
||||
if msg.remaining() != 8 {
|
||||
return Err(ProtocolError::BadMessage(
|
||||
"CancelRequest message is malformed, backend PID / secret key missing"
|
||||
@@ -355,21 +390,22 @@ impl FeStartupPacket {
|
||||
cancel_key: msg.get_i32(),
|
||||
})
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
|
||||
NEGOTIATE_SSL_CODE => {
|
||||
// Requested upgrade to SSL (aka TLS)
|
||||
FeStartupPacket::SslRequest
|
||||
FeStartupPacket::SslRequest { direct: false }
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
|
||||
NEGOTIATE_GSS_CODE => {
|
||||
// Requested upgrade to GSSAPI
|
||||
FeStartupPacket::GssEncRequest
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
|
||||
version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
|
||||
return Err(ProtocolError::Protocol(format!(
|
||||
"Unrecognized request code {unrecognized_code}"
|
||||
"Unrecognized request code {}",
|
||||
version.minor()
|
||||
)));
|
||||
}
|
||||
// TODO bail if protocol major_version is not 3?
|
||||
(major_version, minor_version) => {
|
||||
version => {
|
||||
// StartupMessage
|
||||
|
||||
let s = str::from_utf8(&msg).map_err(|_e| {
|
||||
@@ -382,8 +418,7 @@ impl FeStartupPacket {
|
||||
})?;
|
||||
|
||||
FeStartupPacket::StartupMessage {
|
||||
major_version,
|
||||
minor_version,
|
||||
version,
|
||||
params: StartupMessageParams {
|
||||
params: msg.slice_ref(s.as_bytes()),
|
||||
},
|
||||
@@ -522,6 +557,10 @@ pub enum BeMessage<'a> {
|
||||
RowDescription(&'a [RowDescriptor<'a>]),
|
||||
XLogData(XLogDataBody<'a>),
|
||||
NoticeResponse(&'a str),
|
||||
NegotiateProtocolVersion {
|
||||
version: ProtocolVersion,
|
||||
options: &'a [&'a str],
|
||||
},
|
||||
KeepAlive(WalSndKeepAlive),
|
||||
}
|
||||
|
||||
@@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_u8(u8::from(req.request_reply));
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::NegotiateProtocolVersion { version, options } => {
|
||||
buf.put_u8(b'v');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_u32(version.0);
|
||||
buf.put_u32(options.len() as u32);
|
||||
for option in options.iter() {
|
||||
write_cstr(option, buf)?;
|
||||
}
|
||||
Ok(())
|
||||
})?
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -443,7 +443,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
|
||||
pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
|
||||
let timeout = storage_config.timeout;
|
||||
Ok(match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs { local_path: path } => {
|
||||
@@ -458,7 +458,7 @@ impl GenericRemoteStorage {
|
||||
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?))
|
||||
}
|
||||
RemoteStorageKind::AzureContainer(azure_config) => {
|
||||
let storage_account = azure_config
|
||||
|
||||
@@ -16,16 +16,10 @@ use std::{
|
||||
|
||||
use anyhow::{anyhow, Context as _};
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
meta::credentials::CredentialsProviderChain,
|
||||
profile::ProfileFileCredentialsProvider,
|
||||
provider_config::ProviderConfig,
|
||||
default_provider::credentials::DefaultCredentialsChain,
|
||||
retry::{RetryConfigBuilder, RetryMode},
|
||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
BehaviorVersion,
|
||||
};
|
||||
use aws_credential_types::provider::SharedCredentialsProvider;
|
||||
use aws_sdk_s3::{
|
||||
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
@@ -76,40 +70,27 @@ struct GetObjectRequest {
|
||||
}
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
||||
pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
||||
tracing::debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
remote_storage_config.bucket_name
|
||||
);
|
||||
|
||||
let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
|
||||
let region = Region::new(remote_storage_config.bucket_region.clone());
|
||||
let region_opt = Some(region.clone());
|
||||
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
.or_else(
|
||||
"profile-sso",
|
||||
ProfileFileCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else(
|
||||
"token",
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
// https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html
|
||||
// https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html
|
||||
// Incomplete list of auth methods used by this:
|
||||
// * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
// * "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
// * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// * http (ECS/EKS) container credentials
|
||||
// * imds v2
|
||||
let credentials_provider = DefaultCredentialsChain::builder()
|
||||
.region(region)
|
||||
.build()
|
||||
.await;
|
||||
|
||||
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
|
||||
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
|
||||
@@ -118,9 +99,9 @@ impl S3Bucket {
|
||||
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
|
||||
BehaviorVersion::v2023_11_09(),
|
||||
)
|
||||
.region(region)
|
||||
.region(region_opt)
|
||||
.identity_cache(IdentityCache::lazy().build())
|
||||
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
|
||||
.credentials_provider(credentials_provider)
|
||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
|
||||
|
||||
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
|
||||
@@ -1041,8 +1022,8 @@ mod tests {
|
||||
|
||||
use crate::{RemotePath, S3Bucket, S3Config};
|
||||
|
||||
#[test]
|
||||
fn relative_path() {
|
||||
#[tokio::test]
|
||||
async fn relative_path() {
|
||||
let all_paths = ["", "some/path", "some/path/"];
|
||||
let all_paths: Vec<RemotePath> = all_paths
|
||||
.iter()
|
||||
@@ -1085,8 +1066,9 @@ mod tests {
|
||||
max_keys_per_list_response: Some(5),
|
||||
upload_storage_class: None,
|
||||
};
|
||||
let storage =
|
||||
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
|
||||
let storage = S3Bucket::new(&config, std::time::Duration::ZERO)
|
||||
.await
|
||||
.expect("remote storage init");
|
||||
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
|
||||
let result = storage.relative_path_to_s3_object(test_path);
|
||||
let expected = expected_outputs[prefix_idx][test_path_idx];
|
||||
|
||||
@@ -31,6 +31,7 @@ struct EnabledAzure {
|
||||
impl EnabledAzure {
|
||||
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
|
||||
let client = create_azure_client(max_keys_in_list_response)
|
||||
.await
|
||||
.context("Azure client creation")
|
||||
.expect("Azure client creation failed");
|
||||
|
||||
@@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_azure_client(
|
||||
async fn create_azure_client(
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
use rand::Rng;
|
||||
@@ -221,6 +222,8 @@ fn create_azure_client(
|
||||
timeout: Duration::from_secs(120),
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -197,6 +197,7 @@ struct EnabledS3 {
|
||||
impl EnabledS3 {
|
||||
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
|
||||
let client = create_s3_client(max_keys_in_list_response)
|
||||
.await
|
||||
.context("S3 client creation")
|
||||
.expect("S3 client creation failed");
|
||||
|
||||
@@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_client(
|
||||
async fn create_s3_client(
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
use rand::Rng;
|
||||
@@ -385,7 +386,9 @@ fn create_s3_client(
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,10 @@ pub enum Scope {
|
||||
GenerationsApi,
|
||||
// Allows access to control plane managment API and some storage controller endpoints.
|
||||
Admin,
|
||||
|
||||
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
|
||||
/// of a tenant & post scrub results.
|
||||
Scrubber,
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
|
||||
114
libs/utils/src/circuit_breaker.rs
Normal file
114
libs/utils/src/circuit_breaker.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
use std::{
|
||||
fmt::Display,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use metrics::IntCounter;
|
||||
|
||||
/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
|
||||
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
|
||||
/// to mitigate the log spam from repeated failures.
|
||||
pub struct CircuitBreaker {
|
||||
/// An identifier that enables us to log useful errors when a circuit is broken
|
||||
name: String,
|
||||
|
||||
/// Consecutive failures since last success
|
||||
fail_count: usize,
|
||||
|
||||
/// How many consecutive failures before we break the circuit
|
||||
fail_threshold: usize,
|
||||
|
||||
/// If circuit is broken, when was it broken?
|
||||
broken_at: Option<Instant>,
|
||||
|
||||
/// If set, we will auto-reset the circuit this long after it was broken. If None, broken
|
||||
/// circuits stay broken forever, or until success() is called.
|
||||
reset_period: Option<Duration>,
|
||||
|
||||
/// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker
|
||||
/// to permit something to keep running even if it would otherwise have tripped it.
|
||||
short_circuit: bool,
|
||||
}
|
||||
|
||||
impl CircuitBreaker {
|
||||
pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
|
||||
Self {
|
||||
name,
|
||||
fail_count: 0,
|
||||
fail_threshold,
|
||||
broken_at: None,
|
||||
reset_period,
|
||||
short_circuit: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct an unbreakable circuit breaker, for use in unit tests etc.
|
||||
pub fn short_circuit() -> Self {
|
||||
Self {
|
||||
name: String::new(),
|
||||
fail_threshold: 0,
|
||||
fail_count: 0,
|
||||
broken_at: None,
|
||||
reset_period: None,
|
||||
short_circuit: true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
|
||||
where
|
||||
E: Display,
|
||||
{
|
||||
if self.short_circuit {
|
||||
return;
|
||||
}
|
||||
|
||||
self.fail_count += 1;
|
||||
if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
|
||||
self.break_circuit(metric, error);
|
||||
}
|
||||
}
|
||||
|
||||
/// Call this after successfully executing an operation
|
||||
pub fn success(&mut self, metric: &IntCounter) {
|
||||
self.fail_count = 0;
|
||||
if let Some(broken_at) = &self.broken_at {
|
||||
tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
|
||||
humantime::format_duration(broken_at.elapsed()));
|
||||
self.broken_at = None;
|
||||
metric.inc();
|
||||
}
|
||||
}
|
||||
|
||||
/// Call this before attempting an operation, and skip the operation if we are currently broken.
|
||||
pub fn is_broken(&mut self) -> bool {
|
||||
if self.short_circuit {
|
||||
return false;
|
||||
}
|
||||
|
||||
if let Some(broken_at) = self.broken_at {
|
||||
match self.reset_period {
|
||||
Some(reset_period) if broken_at.elapsed() > reset_period => {
|
||||
self.reset_circuit();
|
||||
false
|
||||
}
|
||||
_ => true,
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
|
||||
where
|
||||
E: Display,
|
||||
{
|
||||
self.broken_at = Some(Instant::now());
|
||||
tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}");
|
||||
metric.inc();
|
||||
}
|
||||
|
||||
fn reset_circuit(&mut self) {
|
||||
self.broken_at = None;
|
||||
self.fail_count = 0;
|
||||
}
|
||||
}
|
||||
@@ -52,17 +52,17 @@ struct RequestId(String);
|
||||
/// There could be other ways to implement similar functionality:
|
||||
///
|
||||
/// * procmacros placed on top of all handler methods
|
||||
/// With all the drawbacks of procmacros, brings no difference implementation-wise,
|
||||
/// and little code reduction compared to the existing approach.
|
||||
/// With all the drawbacks of procmacros, brings no difference implementation-wise,
|
||||
/// and little code reduction compared to the existing approach.
|
||||
///
|
||||
/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
|
||||
/// implemented for [`RouterBuilder`].
|
||||
/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
|
||||
/// implemented for [`RouterBuilder`].
|
||||
/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
|
||||
///
|
||||
/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
|
||||
/// later, in a post-response middleware.
|
||||
/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
|
||||
/// tries to achive with its `.instrument` used in the current approach.
|
||||
/// later, in a post-response middleware.
|
||||
/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
|
||||
/// tries to achive with its `.instrument` used in the current approach.
|
||||
///
|
||||
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
|
||||
pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
|
||||
|
||||
@@ -74,6 +74,15 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
|
||||
request: &Request<Body>,
|
||||
param_name: &str,
|
||||
) -> Result<T, ApiError> {
|
||||
parse_query_param(request, param_name)?.ok_or_else(|| {
|
||||
ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
|
||||
match request.body_mut().data().await {
|
||||
Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
|
||||
|
||||
@@ -302,17 +302,6 @@ pub struct TenantId(Id);
|
||||
|
||||
id_newtype!(TenantId);
|
||||
|
||||
/// Neon Connection Id identifies long-lived connections (for example a pagestream
|
||||
/// connection with the page_service). Is used for better logging and tracing
|
||||
///
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
/// See [`Id`] for alternative ways to serialize it.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
pub struct ConnectionId(Id);
|
||||
|
||||
id_newtype!(ConnectionId);
|
||||
|
||||
// A pair uniquely identifying Neon instance.
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct TenantTimelineId {
|
||||
|
||||
@@ -26,6 +26,8 @@ pub mod auth;
|
||||
// utility functions and helper traits for unified unique id generation/serialization etc.
|
||||
pub mod id;
|
||||
|
||||
pub mod shard;
|
||||
|
||||
mod hex;
|
||||
pub use hex::Hex;
|
||||
|
||||
@@ -96,6 +98,8 @@ pub mod poison;
|
||||
|
||||
pub mod toml_edit_ext;
|
||||
|
||||
pub mod circuit_breaker;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
451
libs/utils/src/shard.rs
Normal file
451
libs/utils/src/shard.rs
Normal file
@@ -0,0 +1,451 @@
|
||||
//! See `pageserver_api::shard` for description on sharding.
|
||||
|
||||
use std::{ops::RangeInclusive, str::FromStr};
|
||||
|
||||
use hex::FromHex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::id::TenantId;
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(pub u8);
|
||||
|
||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||
/// the fully qualified TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
/// Formatting helper, for generating the `shard_id` label in traces.
|
||||
pub struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
||||
///
|
||||
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
||||
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
||||
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
|
||||
///
|
||||
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as [`TenantShardId::unsharded`].
|
||||
///
|
||||
/// This method returns the actual number of shards, i.e. if our internal value is
|
||||
/// zero, we return 1 (unsharded tenants have 1 shard).
|
||||
pub fn count(&self) -> u8 {
|
||||
if self.0 > 0 {
|
||||
self.0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
|
||||
/// The literal internal value: this is **not** the number of shards in the
|
||||
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
|
||||
/// [`Self::count`] if you want to know the cardinality of shards.
|
||||
pub fn literal(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
|
||||
/// uses the legacy format for `TenantShardId`. See also the documentation for
|
||||
/// [`Self::count`].
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
|
||||
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
||||
/// [`Self::literal`] would return.
|
||||
pub const fn new(val: u8) -> Self {
|
||||
Self(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardNumber {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
}
|
||||
|
||||
impl TenantShardId {
|
||||
pub fn unsharded(tenant_id: TenantId) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
|
||||
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber::MAX,
|
||||
shard_count: ShardCount::MAX,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
|
||||
ShardSlug(self)
|
||||
}
|
||||
|
||||
/// Convenience for code that has special behavior on the 0th shard.
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
}
|
||||
|
||||
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
|
||||
/// is useful when logging from code that is already in a span that includes tenant ID, to
|
||||
/// keep messages reasonably terse.
|
||||
pub fn to_index(&self) -> ShardIndex {
|
||||
ShardIndex {
|
||||
shard_number: self.shard_number,
|
||||
shard_count: self.shard_count,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate the children of this TenantShardId when splitting the overall tenant into
|
||||
/// the given number of shards.
|
||||
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
|
||||
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
|
||||
let mut child_shards = Vec::new();
|
||||
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
|
||||
// Key mapping is based on a round robin mapping of key hash modulo shard count,
|
||||
// so our child shards are the ones which the same keys would map to.
|
||||
if shard_number % effective_old_shard_count == self.shard_number.0 {
|
||||
child_shards.push(TenantShardId {
|
||||
tenant_id: self.tenant_id,
|
||||
shard_number: ShardNumber(shard_number),
|
||||
shard_count: new_shard_count,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
child_shards
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{:02x}{:02x}",
|
||||
self.0.shard_number.0, self.0.shard_count.0
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TenantShardId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.shard_count != ShardCount(0) {
|
||||
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
|
||||
} else {
|
||||
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
||||
// is distinct from the normal single shard case (shard count == 1).
|
||||
self.tenant_id.fmt(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for TenantShardId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// Debug is the same as Display: the compact hex representation
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for TenantShardId {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
|
||||
if s.len() == 32 {
|
||||
// Legacy case: no shard specified
|
||||
Ok(Self {
|
||||
tenant_id: TenantId::from_str(s)?,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
})
|
||||
} else if s.len() == 37 {
|
||||
let bytes = s.as_bytes();
|
||||
let tenant_id = TenantId::from_hex(&bytes[0..32])?;
|
||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
||||
hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(shard_parts[0]),
|
||||
shard_count: ShardCount(shard_parts[1]),
|
||||
})
|
||||
} else {
|
||||
Err(hex::FromHexError::InvalidStringLength)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 18]> for TenantShardId {
|
||||
fn from(b: [u8; 18]) -> Self {
|
||||
let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
|
||||
|
||||
Self {
|
||||
tenant_id: TenantId::from(tenant_id_bytes),
|
||||
shard_number: ShardNumber(b[16]),
|
||||
shard_count: ShardCount(b[17]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
shard_number: number,
|
||||
shard_count: count,
|
||||
}
|
||||
}
|
||||
pub fn unsharded() -> Self {
|
||||
Self {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
|
||||
/// For use in constructing remote storage paths: concatenate this with a TenantId
|
||||
/// to get a fully qualified TenantShardId.
|
||||
///
|
||||
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
|
||||
/// that the legacy pre-sharding remote key format is preserved.
|
||||
pub fn get_suffix(&self) -> String {
|
||||
if self.is_unsharded() {
|
||||
"".to_string()
|
||||
} else {
|
||||
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ShardIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// Debug is the same as Display: the compact hex representation
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for ShardIndex {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// Expect format: 1 byte shard number, 1 byte shard count
|
||||
if s.len() == 4 {
|
||||
let bytes = s.as_bytes();
|
||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
||||
hex::decode_to_slice(bytes, &mut shard_parts)?;
|
||||
Ok(Self {
|
||||
shard_number: ShardNumber(shard_parts[0]),
|
||||
shard_count: ShardCount(shard_parts[1]),
|
||||
})
|
||||
} else {
|
||||
Err(hex::FromHexError::InvalidStringLength)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 2]> for ShardIndex {
|
||||
fn from(b: [u8; 2]) -> Self {
|
||||
Self {
|
||||
shard_number: ShardNumber(b[0]),
|
||||
shard_count: ShardCount(b[1]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for TenantShardId {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Note: while human encoding of [`TenantShardId`] is backward and forward
|
||||
// compatible, this binary encoding is not.
|
||||
let mut packed: [u8; 18] = [0; 18];
|
||||
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
||||
packed[16] = self.shard_number.0;
|
||||
packed[17] = self.shard_count.0;
|
||||
|
||||
packed.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TenantShardId {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
||||
type Value = TenantShardId;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 18])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 18] = Deserialize::deserialize(s)?;
|
||||
Ok(TenantShardId::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
TenantShardId::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
18,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for ShardIndex {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Binary encoding is not used in index_part.json, but is included in anticipation of
|
||||
// switching various structures (e.g. inter-process communication, remote metadata) to more
|
||||
// compact binary encodings in future.
|
||||
let mut packed: [u8; 2] = [0; 2];
|
||||
packed[0] = self.shard_number.0;
|
||||
packed[1] = self.shard_count.0;
|
||||
packed.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for ShardIndex {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
||||
type Value = ShardIndex;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 2])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 2] = Deserialize::deserialize(s)?;
|
||||
Ok(ShardIndex::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
ShardIndex::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
2,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -62,6 +62,7 @@ sync_wrapper.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-epoll-uring.workspace = true
|
||||
tokio-io-timeout.workspace = true
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
async-trait.workspace = true
|
||||
reqwest.workspace = true
|
||||
reqwest = { workspace = true, features = [ "stream" ] }
|
||||
utils.workspace = true
|
||||
serde.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use bytes::Bytes;
|
||||
use detach_ancestor::AncestorDetached;
|
||||
use pageserver_api::{models::*, shard::TenantShardId};
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
use utils::{
|
||||
@@ -9,6 +10,8 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub use reqwest::Body as ReqwestBody;
|
||||
|
||||
pub mod util;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -20,6 +23,9 @@ pub struct Client {
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("send request: {0}")]
|
||||
SendRequest(reqwest::Error),
|
||||
|
||||
#[error("receive body: {0}")]
|
||||
ReceiveBody(reqwest::Error),
|
||||
|
||||
@@ -173,19 +179,30 @@ impl Client {
|
||||
self.request(Method::GET, uri, ()).await
|
||||
}
|
||||
|
||||
fn start_request<U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
) -> reqwest::RequestBuilder {
|
||||
let req = self.client.request(method, uri);
|
||||
if let Some(value) = &self.authorization_header {
|
||||
req.header(reqwest::header::AUTHORIZATION, value)
|
||||
} else {
|
||||
req
|
||||
}
|
||||
}
|
||||
|
||||
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
body: B,
|
||||
) -> Result<reqwest::Response> {
|
||||
let req = self.client.request(method, uri);
|
||||
let req = if let Some(value) = &self.authorization_header {
|
||||
req.header(reqwest::header::AUTHORIZATION, value)
|
||||
} else {
|
||||
req
|
||||
};
|
||||
req.json(&body).send().await.map_err(Error::ReceiveBody)
|
||||
self.start_request(method, uri)
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
@@ -402,6 +419,23 @@ impl Client {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn timeline_detach_ancestor(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<AncestorDetached> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
self.request(Method::PUT, &uri, ())
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/reset",
|
||||
@@ -609,4 +643,53 @@ impl Client {
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn import_basebackup(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
base_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
basebackup_tarball: ReqwestBody,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
|
||||
self.mgmt_api_endpoint,
|
||||
);
|
||||
self.start_request(Method::PUT, uri)
|
||||
.body(basebackup_tarball)
|
||||
.send()
|
||||
.await
|
||||
.map_err(Error::SendRequest)?
|
||||
.error_from_body()
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn import_wal(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
wal_tarball: ReqwestBody,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
|
||||
self.mgmt_api_endpoint,
|
||||
);
|
||||
self.start_request(Method::PUT, uri)
|
||||
.body(wal_tarball)
|
||||
.send()
|
||||
.await
|
||||
.map_err(Error::SendRequest)?
|
||||
.error_from_body()
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,7 +131,7 @@ impl CompactionKey for Key {
|
||||
pub type CompactionKeySpace<K> = Vec<Range<K>>;
|
||||
|
||||
/// Functions needed from all layers.
|
||||
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
|
||||
pub trait CompactionLayer<K: CompactionKey> {
|
||||
fn key_range(&self) -> &Range<K>;
|
||||
fn lsn_range(&self) -> &Range<Lsn>;
|
||||
|
||||
|
||||
@@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.get("remote_storage")
|
||||
.expect("need remote_storage");
|
||||
let config = RemoteStorageConfig::from_toml(toml_item)?;
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
|
||||
let cancel = CancellationToken::new();
|
||||
storage
|
||||
.unwrap()
|
||||
|
||||
@@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
|
||||
}
|
||||
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
|
||||
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
|
||||
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Pageserver auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
)),
|
||||
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
|
||||
Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Pageserver auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,6 +47,9 @@ use utils::{
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
@@ -382,7 +385,7 @@ fn start_pageserver(
|
||||
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
|
||||
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;
|
||||
|
||||
// Set up deletion queue
|
||||
let (deletion_queue, deletion_workers) = DeletionQueue::new(
|
||||
@@ -619,7 +622,6 @@ fn start_pageserver(
|
||||
metric_collection_endpoint,
|
||||
&conf.metric_collection_bucket,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
@@ -657,7 +659,6 @@ fn start_pageserver(
|
||||
async move {
|
||||
page_service::libpq_listener_main(
|
||||
tenant_manager,
|
||||
broker_client,
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
@@ -700,7 +701,7 @@ fn start_pageserver(
|
||||
}
|
||||
}
|
||||
|
||||
fn create_remote_storage_client(
|
||||
async fn create_remote_storage_client(
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let config = if let Some(config) = &conf.remote_storage_config {
|
||||
@@ -710,7 +711,7 @@ fn create_remote_storage_client(
|
||||
};
|
||||
|
||||
// Create the client
|
||||
let mut remote_storage = GenericRemoteStorage::from_config(config)?;
|
||||
let mut remote_storage = GenericRemoteStorage::from_config(config).await?;
|
||||
|
||||
// If `test_remote_failures` is non-zero, wrap the client with a
|
||||
// wrapper that simulates failures.
|
||||
|
||||
@@ -12,7 +12,6 @@ use serde::de::IntoDeserializer;
|
||||
use std::env;
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::logging::SecretString;
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
@@ -69,7 +68,6 @@ pub mod defaults {
|
||||
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
|
||||
|
||||
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
|
||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
|
||||
@@ -92,7 +90,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::DisabledNoDecompress;
|
||||
ImageCompressionAlgorithm::Disabled;
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
|
||||
@@ -124,7 +122,6 @@ pub mod defaults {
|
||||
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
|
||||
|
||||
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
|
||||
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
|
||||
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
|
||||
|
||||
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
|
||||
@@ -239,7 +236,6 @@ pub struct PageServerConf {
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
pub metric_collection_interval: Duration,
|
||||
// How often to send unchanged cached metrics to the metrics endpoint.
|
||||
pub cached_metric_collection_interval: Duration,
|
||||
pub metric_collection_endpoint: Option<Url>,
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
@@ -371,7 +367,6 @@ struct PageServerConfigBuilder {
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
|
||||
|
||||
metric_collection_interval: BuilderValue<Duration>,
|
||||
cached_metric_collection_interval: BuilderValue<Duration>,
|
||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
||||
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
@@ -455,10 +450,6 @@ impl PageServerConfigBuilder {
|
||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default metric collection interval")),
|
||||
cached_metric_collection_interval: Set(humantime::parse_duration(
|
||||
DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default cached_metric_collection_interval")),
|
||||
synthetic_size_calculation_interval: Set(humantime::parse_duration(
|
||||
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
|
||||
)
|
||||
@@ -590,14 +581,6 @@ impl PageServerConfigBuilder {
|
||||
self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
|
||||
}
|
||||
|
||||
pub fn cached_metric_collection_interval(
|
||||
&mut self,
|
||||
cached_metric_collection_interval: Duration,
|
||||
) {
|
||||
self.cached_metric_collection_interval =
|
||||
BuilderValue::Set(cached_metric_collection_interval)
|
||||
}
|
||||
|
||||
pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
|
||||
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
||||
}
|
||||
@@ -731,7 +714,6 @@ impl PageServerConfigBuilder {
|
||||
broker_keepalive_interval,
|
||||
log_format,
|
||||
metric_collection_interval,
|
||||
cached_metric_collection_interval,
|
||||
metric_collection_endpoint,
|
||||
metric_collection_bucket,
|
||||
synthetic_size_calculation_interval,
|
||||
@@ -870,22 +852,6 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> Utf8PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
|
||||
pub fn trace_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
connection_id: &ConnectionId,
|
||||
) -> Utf8PathBuf {
|
||||
self.traces_path()
|
||||
.join(tenant_shard_id.to_string())
|
||||
.join(timeline_id.to_string())
|
||||
.join(connection_id.to_string())
|
||||
}
|
||||
|
||||
/// Turns storage remote path of a file into its local path.
|
||||
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
|
||||
remote_path.with_base(&self.workdir)
|
||||
@@ -964,7 +930,6 @@ impl PageServerConf {
|
||||
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
|
||||
}),
|
||||
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
"cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
"metric_collection_endpoint" => {
|
||||
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
||||
builder.metric_collection_endpoint(Some(endpoint));
|
||||
@@ -1097,7 +1062,6 @@ impl PageServerConf {
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
|
||||
),
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
@@ -1276,7 +1240,6 @@ initial_superuser_name = 'zzzz'
|
||||
id = 10
|
||||
|
||||
metric_collection_interval = '222 s'
|
||||
cached_metric_collection_interval = '22200 s'
|
||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
synthetic_size_calculation_interval = '333 s'
|
||||
|
||||
@@ -1332,9 +1295,6 @@ background_task_maximum_delay = '334 s'
|
||||
metric_collection_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
cached_metric_collection_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
||||
@@ -1413,7 +1373,6 @@ background_task_maximum_delay = '334 s'
|
||||
eviction_task_immitated_concurrent_logical_size_queries:
|
||||
ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: Duration::from_secs(222),
|
||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
||||
@@ -1560,34 +1519,6 @@ broker_endpoint = '{broker_endpoint}'
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_tenant_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
let trace_read_requests = true;
|
||||
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{pg_distrib_dir}'
|
||||
broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
[tenant_config]
|
||||
trace_read_requests = {trace_read_requests}"#,
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
|
||||
assert_eq!(
|
||||
conf.default_tenant_conf.trace_read_requests, trace_read_requests,
|
||||
"Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
|
||||
let config_string = r#"
|
||||
|
||||
@@ -46,19 +46,12 @@ pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_bucket: &Option<RemoteStorageConfig>,
|
||||
metric_collection_interval: Duration,
|
||||
_cached_metric_collection_interval: Duration,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
node_id: NodeId,
|
||||
local_disk_storage: Utf8PathBuf,
|
||||
cancel: CancellationToken,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
if _cached_metric_collection_interval != Duration::ZERO {
|
||||
tracing::warn!(
|
||||
"cached_metric_collection_interval is no longer used, please set it to zero."
|
||||
)
|
||||
}
|
||||
|
||||
// spin up background worker that caclulates tenant sizes
|
||||
let worker_ctx =
|
||||
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
|
||||
@@ -103,7 +96,7 @@ pub async fn collect_metrics(
|
||||
.expect("Failed to create http client with timeout");
|
||||
|
||||
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
|
||||
match GenericRemoteStorage::from_config(bucket_config) {
|
||||
match GenericRemoteStorage::from_config(bucket_config).await {
|
||||
Ok(client) => Some(client),
|
||||
Err(e) => {
|
||||
// Non-fatal error: if we were given an invalid config, we will proceed
|
||||
|
||||
@@ -59,6 +59,7 @@
|
||||
//! 1. It should be easy to forward the context to callees.
|
||||
//! 2. To propagate more data from high-level to low-level code, the functions in
|
||||
//! the middle should not need to be modified.
|
||||
//!
|
||||
//! The solution is to have a container structure ([`RequestContext`]) that
|
||||
//! carries the information. Functions that don't care about what's in it
|
||||
//! pass it along to callees.
|
||||
|
||||
@@ -828,9 +828,9 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
|
||||
async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
|
||||
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let harness = TenantHarness::create(test_name).await?;
|
||||
|
||||
// We do not load() the harness: we only need its config and remote_storage
|
||||
|
||||
@@ -844,7 +844,9 @@ mod test {
|
||||
},
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mock_control_plane = MockControlPlane::new();
|
||||
|
||||
@@ -922,7 +924,9 @@ mod test {
|
||||
#[tokio::test]
|
||||
async fn deletion_queue_smoke() -> anyhow::Result<()> {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
|
||||
let ctx = setup("deletion_queue_smoke")
|
||||
.await
|
||||
.expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
@@ -992,7 +996,9 @@ mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn deletion_queue_validation() -> anyhow::Result<()> {
|
||||
let ctx = setup("deletion_queue_validation").expect("Failed test setup");
|
||||
let ctx = setup("deletion_queue_validation")
|
||||
.await
|
||||
.expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
@@ -1051,7 +1057,9 @@ mod test {
|
||||
#[tokio::test]
|
||||
async fn deletion_queue_recovery() -> anyhow::Result<()> {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
|
||||
let mut ctx = setup("deletion_queue_recovery")
|
||||
.await
|
||||
.expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
|
||||
@@ -377,7 +377,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
@@ -397,6 +397,51 @@ paths:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
put:
|
||||
description: |
|
||||
Either archives or unarchives the given timeline.
|
||||
An archived timeline may not have any non-archived children.
|
||||
requestBody:
|
||||
required: false
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ArchivalConfigRequest"
|
||||
responses:
|
||||
"200":
|
||||
description: Timeline (un)archived successfully
|
||||
"409":
|
||||
description: |
|
||||
The tenant/timeline is already being modified, perhaps by a concurrent call to this API
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -429,7 +474,9 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SyntheticSizeResponse"
|
||||
text/html:
|
||||
description: SVG representation of the tenant and it's timelines.
|
||||
schema:
|
||||
type: string
|
||||
description: SVG representation of the tenant and its timelines.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
@@ -568,7 +615,7 @@ paths:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
ŕequired: true
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
||||
@@ -774,15 +821,13 @@ components:
|
||||
TenantCreateRequest:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/TenantConfig'
|
||||
- $ref: '#/components/schemas/TenantLoadRequest'
|
||||
- type: object
|
||||
required:
|
||||
- new_tenant_id
|
||||
properties:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantLoadRequest:
|
||||
type: object
|
||||
properties:
|
||||
@@ -846,6 +891,15 @@ components:
|
||||
warm:
|
||||
type: boolean
|
||||
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
|
||||
ArchivalConfigRequest:
|
||||
type: object
|
||||
required
|
||||
- state
|
||||
properties:
|
||||
state:
|
||||
description: The archival state of a timeline
|
||||
type: string
|
||||
enum: ["Archived", "Unarchived"]
|
||||
TenantConfig:
|
||||
type: object
|
||||
properties:
|
||||
@@ -873,8 +927,6 @@ components:
|
||||
type: string
|
||||
max_lsn_wal_lag:
|
||||
type: integer
|
||||
trace_read_requests:
|
||||
type: boolean
|
||||
heatmap_period:
|
||||
type: string
|
||||
TenantConfigResponse:
|
||||
@@ -1108,7 +1160,7 @@ components:
|
||||
reparented_timelines:
|
||||
type: array
|
||||
description: Set of reparented timeline ids
|
||||
properties:
|
||||
items:
|
||||
type: string
|
||||
format: hex
|
||||
description: TimelineId
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use enumset::EnumSet;
|
||||
use futures::StreamExt;
|
||||
use futures::TryFutureExt;
|
||||
use humantime::format_rfc3339;
|
||||
use hyper::header;
|
||||
@@ -17,14 +18,17 @@ use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::LsnLeaseRequest;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigRequest;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
use pageserver_api::models::TenantScanRemoteStorageResponse;
|
||||
use pageserver_api::models::TenantScanRemoteStorageShard;
|
||||
@@ -32,24 +36,24 @@ use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TimelineArchivalConfigRequest;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::TopTenantShardsRequest;
|
||||
use pageserver_api::models::TopTenantShardsResponse;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
|
||||
};
|
||||
use pageserver_api::shard::ShardCount;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeTravelError;
|
||||
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::prometheus_metrics_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::request::must_parse_query_param;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -661,6 +665,39 @@ async fn timeline_preserve_initdb_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_archival_config_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
tenant
|
||||
.apply_timeline_archival_config(timeline_id, request_data.state)
|
||||
.await
|
||||
.context("applying archival config")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_archival_config",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
state = ?request_data.state,
|
||||
%timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1718,7 +1755,9 @@ async fn timeline_detach_ancestor_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
use crate::tenant::timeline::detach_ancestor::Options;
|
||||
use crate::tenant::timeline::detach_ancestor;
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -1726,7 +1765,7 @@ async fn timeline_detach_ancestor_handler(
|
||||
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
|
||||
|
||||
async move {
|
||||
let mut options = Options::default();
|
||||
let mut options = detach_ancestor::Options::default();
|
||||
|
||||
let rewrite_concurrency =
|
||||
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
|
||||
@@ -1754,27 +1793,36 @@ async fn timeline_detach_ancestor_handler(
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let (_guard, prepared) = timeline
|
||||
let progress = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
.await?;
|
||||
|
||||
let res = state
|
||||
.tenant_manager
|
||||
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
|
||||
.await;
|
||||
// uncomment to allow early as possible Tenant::drop
|
||||
// drop(tenant);
|
||||
|
||||
match res {
|
||||
Ok(reparented_timelines) => {
|
||||
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
|
||||
let resp = match progress {
|
||||
detach_ancestor::Progress::Prepared(_guard, prepared) => {
|
||||
// it would be great to tag the guard on to the tenant activation future
|
||||
let reparented_timelines = state
|
||||
.tenant_manager
|
||||
.complete_detaching_timeline_ancestor(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
prepared,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("timeline detach ancestor completion")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
AncestorDetached {
|
||||
reparented_timelines,
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
}
|
||||
Err(e) => Err(ApiError::InternalServerError(
|
||||
e.context("timeline detach completion"),
|
||||
)),
|
||||
}
|
||||
detach_ancestor::Progress::Done(resp) => resp,
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
@@ -2404,6 +2452,189 @@ async fn post_top_tenants(
|
||||
)
|
||||
}
|
||||
|
||||
async fn put_tenant_timeline_import_basebackup(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
|
||||
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
|
||||
let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
|
||||
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
|
||||
async move {
|
||||
let state = get_state(&request);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
|
||||
|
||||
let broker_client = state.broker_client.clone();
|
||||
|
||||
let mut body = StreamReader::new(request.into_body().map(|res| {
|
||||
res.map_err(|error| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
|
||||
})
|
||||
}));
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.map_err(ApiError::InternalServerError)
|
||||
.await?;
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
// We might have some wal to import as well, and we should prevent compute
|
||||
// from connecting before that and writing conflicting wal.
|
||||
//
|
||||
// This is not relevant for pageserver->pageserver migrations, since there's
|
||||
// no wal to import. But should be fixed if we want to import from postgres.
|
||||
|
||||
// TODO leave clean state on error. For now you can use detach to clean
|
||||
// up broken state from a failed import.
|
||||
|
||||
// Import basebackup provided via CopyData
|
||||
info!("importing basebackup");
|
||||
|
||||
timeline
|
||||
.import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// Read the end of the tar archive.
|
||||
read_tar_eof(body)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// TODO check checksum
|
||||
// Meanwhile you can verify client-side by taking fullbackup
|
||||
// and checking that it matches in size with what was imported.
|
||||
// It wouldn't work if base came from vanilla postgres though,
|
||||
// since we discard some log files.
|
||||
|
||||
info!("done");
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn put_tenant_timeline_import_wal(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
|
||||
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
|
||||
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
|
||||
async move {
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
|
||||
|
||||
let mut body = StreamReader::new(request.into_body().map(|res| {
|
||||
res.map_err(|error| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
|
||||
})
|
||||
}));
|
||||
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if last_record_lsn != start_lsn {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
|
||||
}
|
||||
|
||||
// TODO leave clean state on error. For now you can use detach to clean
|
||||
// up broken state from a failed import.
|
||||
|
||||
// Import wal provided via CopyData
|
||||
info!("importing wal");
|
||||
crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
|
||||
info!("wal import complete");
|
||||
|
||||
// Read the end of the tar archive.
|
||||
read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// TODO Does it make sense to overshoot?
|
||||
if timeline.get_last_record_lsn() < end_lsn {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
|
||||
}
|
||||
|
||||
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
// shape of deltas or images.
|
||||
info!("flushing layers");
|
||||
timeline.freeze_and_flush().await.map_err(|e| match e {
|
||||
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
|
||||
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
|
||||
info!("done");
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}.instrument(span).await
|
||||
}
|
||||
|
||||
/// Read the end of a tar archive.
|
||||
///
|
||||
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
|
||||
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
|
||||
/// and check that there is no more data after the EOF marker.
|
||||
///
|
||||
/// 'tar' command can also write extra blocks of zeros, up to a record
|
||||
/// size, controlled by the --record-size argument. Ignore them too.
|
||||
async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
|
||||
use tokio::io::AsyncReadExt;
|
||||
let mut buf = [0u8; 512];
|
||||
|
||||
// Read the all-zeros block, and verify it
|
||||
let mut total_bytes = 0;
|
||||
while total_bytes < 512 {
|
||||
let nbytes = reader.read(&mut buf[total_bytes..]).await?;
|
||||
total_bytes += nbytes;
|
||||
if nbytes == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if total_bytes < 512 {
|
||||
anyhow::bail!("incomplete or invalid tar EOF marker");
|
||||
}
|
||||
if !buf.iter().all(|&x| x == 0) {
|
||||
anyhow::bail!("invalid tar EOF marker");
|
||||
}
|
||||
|
||||
// Drain any extra zero-blocks after the EOF marker
|
||||
let mut trailing_bytes = 0;
|
||||
let mut seen_nonzero_bytes = false;
|
||||
loop {
|
||||
let nbytes = reader.read(&mut buf).await?;
|
||||
trailing_bytes += nbytes;
|
||||
if !buf.iter().all(|&x| x == 0) {
|
||||
seen_nonzero_bytes = true;
|
||||
}
|
||||
if nbytes == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if seen_nonzero_bytes {
|
||||
anyhow::bail!("unexpected non-zero bytes after the tar archive");
|
||||
}
|
||||
if trailing_bytes % 512 != 0 {
|
||||
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -2592,6 +2823,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|
||||
|r| api_handler(r, timeline_archival_config_handler),
|
||||
)
|
||||
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||
api_handler(r, timeline_detail_handler)
|
||||
})
|
||||
@@ -2698,5 +2933,13 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
|
||||
|r| testing_api_handler("perf_info", r, perf_info),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
|
||||
|r| api_handler(r, put_tenant_timeline_import_basebackup),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
|
||||
|r| api_handler(r, put_tenant_timeline_import_wal),
|
||||
)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -23,7 +23,6 @@ pub mod span;
|
||||
pub(crate) mod statvfs;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod trace;
|
||||
pub mod utilization;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
|
||||
@@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum MetricLayerKind {
|
||||
Delta,
|
||||
Image,
|
||||
}
|
||||
|
||||
static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_layer_bytes",
|
||||
"Sum of layer physical sizes in bytes",
|
||||
&["tenant_id", "shard_id", "timeline_id", "kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_layer_count",
|
||||
"Number of layers that exist",
|
||||
&["tenant_id", "shard_id", "timeline_id", "kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_archive_size",
|
||||
@@ -500,15 +525,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_visible_physical_size",
|
||||
"The size of the layer files present in the pageserver's filesystem.",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_resident_physical_size_global",
|
||||
@@ -578,6 +594,38 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_circuit_breaker_broken",
|
||||
"How many times a circuit breaker has broken"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_circuit_breaker_unbroken",
|
||||
"How many times a circuit breaker has been un-broken (recovered)"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_in_bytes_total",
|
||||
"Size of uncompressed data written into image layers"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_compression_image_out_bytes_total",
|
||||
"Size of compressed image layer written"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) mod initial_logical_size {
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -1482,10 +1530,7 @@ pub(crate) enum ComputeCommandKind {
|
||||
PageStream,
|
||||
Basebackup,
|
||||
Fullbackup,
|
||||
ImportBasebackup,
|
||||
ImportWal,
|
||||
LeaseLsn,
|
||||
Show,
|
||||
}
|
||||
|
||||
pub(crate) struct ComputeCommandCounters {
|
||||
@@ -2137,9 +2182,12 @@ pub(crate) struct TimelineMetrics {
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub pitr_history_size: UIntGauge,
|
||||
pub archival_size: UIntGauge,
|
||||
pub(crate) layer_size_image: UIntGauge,
|
||||
pub(crate) layer_count_image: UIntGauge,
|
||||
pub(crate) layer_size_delta: UIntGauge,
|
||||
pub(crate) layer_count_delta: UIntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub visible_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub aux_file_size_gauge: IntGauge,
|
||||
@@ -2220,15 +2268,48 @@ impl TimelineMetrics {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let layer_size_image = TIMELINE_LAYER_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let layer_count_image = TIMELINE_LAYER_COUNT
|
||||
.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let layer_size_delta = TIMELINE_LAYER_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let layer_count_delta = TIMELINE_LAYER_COUNT
|
||||
.get_metric_with_label_values(&[
|
||||
&tenant_id,
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let standby_horizon_gauge = STANDBY_HORIZON
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
// TODO: we shouldn't expose this metric
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
@@ -2277,9 +2358,12 @@ impl TimelineMetrics {
|
||||
last_record_gauge,
|
||||
pitr_history_size,
|
||||
archival_size,
|
||||
layer_size_image,
|
||||
layer_count_image,
|
||||
layer_size_delta,
|
||||
layer_count_delta,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
visible_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
aux_file_size_gauge,
|
||||
directory_entries_count_gauge,
|
||||
@@ -2331,7 +2415,6 @@ impl TimelineMetrics {
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
@@ -2340,6 +2423,31 @@ impl TimelineMetrics {
|
||||
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
]);
|
||||
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
@@ -4,9 +4,7 @@
|
||||
use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use bytes::Bytes;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::Stream;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::TenantState;
|
||||
@@ -28,7 +26,6 @@ use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::pin::pin;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@@ -37,10 +34,8 @@ use std::time::Instant;
|
||||
use std::time::SystemTime;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::sync::gate::GateGuard;
|
||||
use utils::{
|
||||
auth::{Claims, Scope, SwappableJwtAuth},
|
||||
@@ -53,7 +48,6 @@ use crate::auth::check_permission;
|
||||
use crate::basebackup;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
@@ -66,13 +60,11 @@ use crate::tenant::mgr::GetTenantError;
|
||||
use crate::tenant::mgr::ShardResolveResult;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
use crate::tenant::timeline::FlushLayerError;
|
||||
use crate::tenant::timeline::WaitLsnError;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
@@ -82,56 +74,6 @@ use postgres_ffi::BLCKSZ;
|
||||
// is not yet in state [`TenantState::Active`].
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
|
||||
/// Read the end of a tar archive.
|
||||
///
|
||||
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
|
||||
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
|
||||
/// and check that there is no more data after the EOF marker.
|
||||
///
|
||||
/// 'tar' command can also write extra blocks of zeros, up to a record
|
||||
/// size, controlled by the --record-size argument. Ignore them too.
|
||||
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
|
||||
use tokio::io::AsyncReadExt;
|
||||
let mut buf = [0u8; 512];
|
||||
|
||||
// Read the all-zeros block, and verify it
|
||||
let mut total_bytes = 0;
|
||||
while total_bytes < 512 {
|
||||
let nbytes = reader.read(&mut buf[total_bytes..]).await?;
|
||||
total_bytes += nbytes;
|
||||
if nbytes == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if total_bytes < 512 {
|
||||
anyhow::bail!("incomplete or invalid tar EOF marker");
|
||||
}
|
||||
if !buf.iter().all(|&x| x == 0) {
|
||||
anyhow::bail!("invalid tar EOF marker");
|
||||
}
|
||||
|
||||
// Drain any extra zero-blocks after the EOF marker
|
||||
let mut trailing_bytes = 0;
|
||||
let mut seen_nonzero_bytes = false;
|
||||
loop {
|
||||
let nbytes = reader.read(&mut buf).await?;
|
||||
trailing_bytes += nbytes;
|
||||
if !buf.iter().all(|&x| x == 0) {
|
||||
seen_nonzero_bytes = true;
|
||||
}
|
||||
if nbytes == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if seen_nonzero_bytes {
|
||||
anyhow::bail!("unexpected non-zero bytes after the tar archive");
|
||||
}
|
||||
if trailing_bytes % 512 != 0 {
|
||||
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
///
|
||||
@@ -141,7 +83,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
|
||||
///
|
||||
pub async fn libpq_listener_main(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
@@ -186,7 +127,6 @@ pub async fn libpq_listener_main(
|
||||
false,
|
||||
page_service_conn_main(
|
||||
tenant_manager.clone(),
|
||||
broker_client.clone(),
|
||||
local_auth,
|
||||
socket,
|
||||
auth_type,
|
||||
@@ -209,7 +149,6 @@ pub async fn libpq_listener_main(
|
||||
#[instrument(skip_all, fields(peer_addr))]
|
||||
async fn page_service_conn_main(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
@@ -262,12 +201,11 @@ async fn page_service_conn_main(
|
||||
// and create a child per-query context when it invokes process_query.
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
// and create the per-query context in process_query ourselves.
|
||||
let mut conn_handler =
|
||||
PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
|
||||
let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
|
||||
match pgbackend
|
||||
.run(&mut conn_handler, task_mgr::shutdown_watcher)
|
||||
.run(&mut conn_handler, &task_mgr::shutdown_token())
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
@@ -294,7 +232,6 @@ struct HandlerTimeline {
|
||||
}
|
||||
|
||||
struct PageServerHandler {
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
|
||||
@@ -386,13 +323,11 @@ impl From<WaitLsnError> for QueryError {
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
connection_ctx: RequestContext,
|
||||
) -> Self {
|
||||
PageServerHandler {
|
||||
tenant_manager,
|
||||
broker_client,
|
||||
auth,
|
||||
claims: None,
|
||||
connection_ctx,
|
||||
@@ -475,73 +410,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
|
||||
fn copyin_stream<'a, IO>(
|
||||
&'a self,
|
||||
pgb: &'a mut PostgresBackend<IO>,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> impl Stream<Item = io::Result<Bytes>> + 'a
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
async_stream::try_stream! {
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
let msg = "pageserver is shutting down";
|
||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
||||
Err(QueryError::Shutdown)
|
||||
}
|
||||
|
||||
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
|
||||
};
|
||||
|
||||
match msg {
|
||||
Ok(Some(message)) => {
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
FeMessage::CopyDone => { break },
|
||||
FeMessage::Sync => continue,
|
||||
FeMessage::Terminate => {
|
||||
let msg = "client terminated connection with Terminate message during COPY";
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
break;
|
||||
}
|
||||
m => {
|
||||
let msg = format!("unexpected message {m:?}");
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg))?;
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
yield copy_data_bytes;
|
||||
}
|
||||
Ok(None) => {
|
||||
let msg = "client closed connection during COPY";
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
}
|
||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
||||
Err(io_error)?;
|
||||
}
|
||||
Err(other) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn handle_pagerequests<IO>(
|
||||
&mut self,
|
||||
@@ -560,18 +428,6 @@ impl PageServerHandler {
|
||||
.get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
|
||||
.await?;
|
||||
|
||||
// Make request tracer if needed
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
let connection_id = ConnectionId::generate();
|
||||
let path =
|
||||
tenant
|
||||
.conf
|
||||
.trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
|
||||
Some(Tracer::new(path))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// switch client to COPYBOTH
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
self.flush_cancellable(pgb, &tenant.cancel).await?;
|
||||
@@ -603,11 +459,6 @@ impl PageServerHandler {
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// Trace request if needed
|
||||
if let Some(t) = tracer.as_mut() {
|
||||
t.trace(©_data_bytes)
|
||||
}
|
||||
|
||||
let neon_fe_msg =
|
||||
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
||||
|
||||
@@ -713,128 +564,6 @@ impl PageServerHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
|
||||
async fn handle_import_basebackup<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
base_lsn: Lsn,
|
||||
_end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
|
||||
.await?;
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.await?;
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
// We might have some wal to import as well, and we should prevent compute
|
||||
// from connecting before that and writing conflicting wal.
|
||||
//
|
||||
// This is not relevant for pageserver->pageserver migrations, since there's
|
||||
// no wal to import. But should be fixed if we want to import from postgres.
|
||||
|
||||
// TODO leave clean state on error. For now you can use detach to clean
|
||||
// up broken state from a failed import.
|
||||
|
||||
// Import basebackup provided via CopyData
|
||||
info!("importing basebackup");
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
self.flush_cancellable(pgb, &tenant.cancel).await?;
|
||||
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||
timeline
|
||||
.import_basebackup_from_tar(
|
||||
tenant.clone(),
|
||||
&mut copyin_reader,
|
||||
base_lsn,
|
||||
self.broker_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Read the end of the tar archive.
|
||||
read_tar_eof(copyin_reader).await?;
|
||||
|
||||
// TODO check checksum
|
||||
// Meanwhile you can verify client-side by taking fullbackup
|
||||
// and checking that it matches in size with what was imported.
|
||||
// It wouldn't work if base came from vanilla postgres though,
|
||||
// since we discard some log files.
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
|
||||
async fn handle_import_wal<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.await?;
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if last_record_lsn != start_lsn {
|
||||
return Err(QueryError::Other(
|
||||
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
|
||||
);
|
||||
}
|
||||
|
||||
// TODO leave clean state on error. For now you can use detach to clean
|
||||
// up broken state from a failed import.
|
||||
|
||||
// Import wal provided via CopyData
|
||||
info!("importing wal");
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
|
||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
||||
info!("wal import complete");
|
||||
|
||||
// Read the end of the tar archive.
|
||||
read_tar_eof(copyin_reader).await?;
|
||||
|
||||
// TODO Does it make sense to overshoot?
|
||||
if timeline.get_last_record_lsn() < end_lsn {
|
||||
return Err(QueryError::Other(
|
||||
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
|
||||
);
|
||||
}
|
||||
|
||||
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
// shape of deltas or images.
|
||||
info!("flushing layers");
|
||||
timeline.freeze_and_flush().await.map_err(|e| match e {
|
||||
FlushLayerError::Cancelled => QueryError::Shutdown,
|
||||
other => QueryError::Other(other.into()),
|
||||
})?;
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper function to handle the LSN from client request.
|
||||
///
|
||||
/// Each GetPage (and Exists and Nblocks) request includes information about
|
||||
@@ -1705,109 +1434,6 @@ where
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("import basebackup ") {
|
||||
// Import the `base` section (everything but the wal) of a basebackup.
|
||||
// Assumes the tenant already exists on this pageserver.
|
||||
//
|
||||
// Files are scheduled to be persisted to remote storage, and the
|
||||
// caller should poll the http api to check when that is done.
|
||||
//
|
||||
// Example import command:
|
||||
// 1. Get start/end LSN from backup_manifest file
|
||||
// 2. Run:
|
||||
// cat my_backup/base.tar | psql -h $PAGESERVER \
|
||||
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
|
||||
let params = &parts[2..];
|
||||
if params.len() != 5 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import basebackup command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
let base_lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
let end_lsn = Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
|
||||
let pg_version = u32::from_str(params[4])
|
||||
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::ImportBasebackup)
|
||||
.inc();
|
||||
|
||||
match self
|
||||
.handle_import_basebackup(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
base_lsn,
|
||||
end_lsn,
|
||||
pg_version,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if query_string.starts_with("import wal ") {
|
||||
// Import the `pg_wal` section of a basebackup.
|
||||
//
|
||||
// Files are scheduled to be persisted to remote storage, and the
|
||||
// caller should poll the http api to check when that is done.
|
||||
let params = &parts[2..];
|
||||
if params.len() != 4 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import wal command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
let start_lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
let end_lsn = Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::ImportWal)
|
||||
.inc();
|
||||
|
||||
match self
|
||||
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
@@ -1853,66 +1479,6 @@ where
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if let Some(params) = parts.strip_prefix(&["show"]) {
|
||||
// show <tenant_id>
|
||||
if params.len() != 1 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for config command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
|
||||
tracing::Span::current().record("tenant_id", field::display(tenant_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Show)
|
||||
.inc();
|
||||
|
||||
let tenant = self
|
||||
.get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::Zero,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||
RowDescriptor::int8_col(b"compaction_target_size"),
|
||||
RowDescriptor::int8_col(b"compaction_period"),
|
||||
RowDescriptor::int8_col(b"compaction_threshold"),
|
||||
RowDescriptor::int8_col(b"gc_horizon"),
|
||||
RowDescriptor::int8_col(b"gc_period"),
|
||||
RowDescriptor::int8_col(b"image_creation_threshold"),
|
||||
RowDescriptor::int8_col(b"pitr_interval"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
|
||||
Some(
|
||||
tenant
|
||||
.get_checkpoint_timeout()
|
||||
.as_secs()
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(tenant.get_compaction_target_size().to_string().as_bytes()),
|
||||
Some(
|
||||
tenant
|
||||
.get_compaction_period()
|
||||
.as_secs()
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(tenant.get_compaction_threshold().to_string().as_bytes()),
|
||||
Some(tenant.get_gc_horizon().to_string().as_bytes()),
|
||||
Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
|
||||
Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
|
||||
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unknown command {query_string}"
|
||||
|
||||
@@ -522,7 +522,7 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<TimestampTz>, PageReconstructError> {
|
||||
let mut max: Option<TimestampTz> = None;
|
||||
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
|
||||
self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
|
||||
if let Some(max_prev) = max {
|
||||
max = Some(max_prev.max(timestamp));
|
||||
} else {
|
||||
@@ -854,13 +854,14 @@ impl Timeline {
|
||||
result.add_key(DBDIR_KEY);
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
let dbdir = self.list_dbdirs(lsn, ctx).await?;
|
||||
let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
|
||||
|
||||
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
|
||||
dbs.sort_unstable();
|
||||
for (spcnode, dbnode) in dbs {
|
||||
result.add_key(relmap_file_key(spcnode, dbnode));
|
||||
dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
|
||||
for ((spcnode, dbnode), has_relmap_file) in dbs {
|
||||
if has_relmap_file {
|
||||
result.add_key(relmap_file_key(spcnode, dbnode));
|
||||
}
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> = self
|
||||
@@ -919,6 +920,9 @@ impl Timeline {
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
}
|
||||
|
||||
// Add extra keyspaces in the test cases. Some test cases write keys into the storage without
|
||||
// creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
|
||||
// and the keys will not be garbage-colllected.
|
||||
#[cfg(test)]
|
||||
{
|
||||
let guard = self.extra_test_dense_keyspace.load();
|
||||
@@ -927,13 +931,48 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
result.to_keyspace(),
|
||||
/* AUX sparse key space */
|
||||
SparseKeySpace(KeySpace {
|
||||
ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
|
||||
}),
|
||||
))
|
||||
let dense_keyspace = result.to_keyspace();
|
||||
let sparse_keyspace = SparseKeySpace(KeySpace {
|
||||
ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
|
||||
});
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Verify if the sparse keyspaces are ordered and non-overlapping.
|
||||
|
||||
// We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
|
||||
// category of sparse keys are split into their own image/delta files. If there
|
||||
// are overlapping keyspaces, they will be automatically merged by keyspace accum,
|
||||
// and we want the developer to keep the keyspaces separated.
|
||||
|
||||
let ranges = &sparse_keyspace.0.ranges;
|
||||
|
||||
// TODO: use a single overlaps_with across the codebase
|
||||
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
|
||||
!(a.end <= b.start || b.end <= a.start)
|
||||
}
|
||||
for i in 0..ranges.len() {
|
||||
for j in 0..i {
|
||||
if overlaps_with(&ranges[i], &ranges[j]) {
|
||||
panic!(
|
||||
"overlapping sparse keyspace: {}..{} and {}..{}",
|
||||
ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
for i in 1..ranges.len() {
|
||||
assert!(
|
||||
ranges[i - 1].end <= ranges[i].start,
|
||||
"unordered sparse keyspace: {}..{} and {}..{}",
|
||||
ranges[i - 1].start,
|
||||
ranges[i - 1].end,
|
||||
ranges[i].start,
|
||||
ranges[i].end
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((dense_keyspace, sparse_keyspace))
|
||||
}
|
||||
|
||||
/// Get cached size of relation if it not updated after specified LSN
|
||||
@@ -1992,7 +2031,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn aux_files_round_trip() -> anyhow::Result<()> {
|
||||
let name = "aux_files_round_trip";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
let harness = TenantHarness::create(name).await?;
|
||||
|
||||
pub const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
|
||||
@@ -19,9 +19,9 @@ use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
@@ -31,7 +31,6 @@ use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
@@ -41,6 +40,7 @@ use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::backoff;
|
||||
use utils::circuit_breaker::CircuitBreaker;
|
||||
use utils::completion;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::failpoint_support;
|
||||
@@ -78,7 +78,8 @@ use crate::is_uninit_mark;
|
||||
use crate::l0_flush::L0FlushGlobalState;
|
||||
use crate::metrics::TENANT;
|
||||
use crate::metrics::{
|
||||
remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
|
||||
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
|
||||
};
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr;
|
||||
@@ -94,12 +95,14 @@ use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::walredo;
|
||||
use crate::InitializationOrder;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Debug;
|
||||
use std::fmt::Display;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::ops::Bound::Included;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
@@ -276,6 +279,10 @@ pub struct Tenant {
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
|
||||
/// Track repeated failures to compact, so that we can back off.
|
||||
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
|
||||
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
|
||||
|
||||
/// If the tenant is in Activating state, notify this to encourage it
|
||||
/// to proceed to Active as soon as possible, rather than waiting for lazy
|
||||
/// background warmup.
|
||||
@@ -632,11 +639,6 @@ impl Tenant {
|
||||
timeline.maybe_spawn_flush_loop();
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ancestor) = timeline.get_ancestor_timeline() {
|
||||
let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
|
||||
ancestor_gc_info.insert_child(timeline.timeline_id, timeline.get_ancestor_lsn());
|
||||
}
|
||||
};
|
||||
|
||||
// Sanity check: a timeline should have some content.
|
||||
@@ -1227,6 +1229,14 @@ impl Tenant {
|
||||
Ok(timeline_preloads)
|
||||
}
|
||||
|
||||
pub async fn apply_timeline_archival_config(
|
||||
&self,
|
||||
_timeline_id: TimelineId,
|
||||
_config: TimelineArchivalState,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
|
||||
self.tenant_shard_id
|
||||
}
|
||||
@@ -1646,13 +1656,31 @@ impl Tenant {
|
||||
timelines_to_compact
|
||||
};
|
||||
|
||||
// Before doing any I/O work, check our circuit breaker
|
||||
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
|
||||
info!("Skipping compaction due to previous failures");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
timeline
|
||||
.compact(cancel, EnumSet::empty(), ctx)
|
||||
.instrument(info_span!("compact_timeline", %timeline_id))
|
||||
.await?;
|
||||
.await
|
||||
.map_err(|e| {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, &e);
|
||||
e
|
||||
})?;
|
||||
}
|
||||
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.success(&CIRCUIT_BREAKERS_UNBROKEN);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1738,9 +1766,6 @@ impl Tenant {
|
||||
.values()
|
||||
.filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));
|
||||
|
||||
// Before activation, populate each Timeline's GcInfo with information about its children
|
||||
self.initialize_gc_info(&timelines_accessor);
|
||||
|
||||
// Spawn gc and compaction loops. The loops will shut themselves
|
||||
// down when they notice that the tenant is inactive.
|
||||
tasks::start_background_loops(self, background_jobs_can_start);
|
||||
@@ -2349,13 +2374,6 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
||||
}
|
||||
|
||||
pub fn get_trace_read_requests(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.trace_read_requests
|
||||
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
|
||||
}
|
||||
|
||||
pub fn get_min_resident_size_override(&self) -> Option<u64> {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
@@ -2578,6 +2596,14 @@ impl Tenant {
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
|
||||
format!("compaction-{tenant_shard_id}"),
|
||||
5,
|
||||
// Compaction can be a very expensive operation, and might leak disk space. It also ought
|
||||
// to be infallible, as long as remote storage is available. So if it repeatedly fails,
|
||||
// use an extremely long backoff.
|
||||
Some(Duration::from_secs(3600 * 24)),
|
||||
)),
|
||||
activate_now_sem: tokio::sync::Semaphore::new(0),
|
||||
cancel: CancellationToken::default(),
|
||||
gate: Gate::default(),
|
||||
@@ -2773,56 +2799,6 @@ impl Tenant {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Populate all Timelines' `GcInfo` with information about their children. We do not set the
|
||||
/// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`]
|
||||
///
|
||||
/// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion.
|
||||
fn initialize_gc_info(
|
||||
&self,
|
||||
timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
) {
|
||||
// This function must be called before activation: after activation timeline create/delete operations
|
||||
// might happen, and this function is not safe to run concurrently with those.
|
||||
assert!(!self.is_active());
|
||||
|
||||
// Scan all timelines. For each timeline, remember the timeline ID and
|
||||
// the branch point where it was created.
|
||||
let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId, Option<KeySpace>)>> =
|
||||
BTreeMap::new();
|
||||
timelines.iter().for_each(|(timeline_id, timeline_entry)| {
|
||||
if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
|
||||
let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
|
||||
ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id, None));
|
||||
}
|
||||
});
|
||||
|
||||
// The number of bytes we always keep, irrespective of PITR: this is a constant across timelines
|
||||
let horizon = self.get_gc_horizon();
|
||||
|
||||
// Populate each timeline's GcInfo with information about its child branches
|
||||
for timeline in timelines.values() {
|
||||
let mut branchpoints: Vec<(Lsn, TimelineId, Option<KeySpace>)> = all_branchpoints
|
||||
.remove(&timeline.timeline_id)
|
||||
.unwrap_or_default();
|
||||
|
||||
branchpoints.sort_by_key(|b| b.0);
|
||||
|
||||
let mut target = timeline.gc_info.write().unwrap();
|
||||
|
||||
target.retain_lsns = branchpoints;
|
||||
|
||||
let horizon_cutoff = timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(horizon)
|
||||
.unwrap_or(Lsn(0));
|
||||
|
||||
target.cutoffs = GcCutoffs {
|
||||
horizon: horizon_cutoff,
|
||||
pitr: Lsn::INVALID,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
async fn refresh_gc_info_internal(
|
||||
&self,
|
||||
target_timeline_id: Option<TimelineId>,
|
||||
@@ -2845,11 +2821,6 @@ impl Tenant {
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if target_timeline_id.is_some() && timelines.is_empty() {
|
||||
// We were to act on a particular timeline and it wasn't found
|
||||
return Err(GcError::TimelineNotFound);
|
||||
}
|
||||
|
||||
let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
|
||||
HashMap::with_capacity(timelines.len());
|
||||
|
||||
@@ -2872,15 +2843,68 @@ impl Tenant {
|
||||
// because that will stall branch creation.
|
||||
let gc_cs = self.gc_cs.lock().await;
|
||||
|
||||
// Scan all timelines. For each timeline, remember the timeline ID and
|
||||
// the branch point where it was created.
|
||||
let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let mut all_branchpoints = BTreeSet::new();
|
||||
let timelines = {
|
||||
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
|
||||
if timelines.get(target_timeline_id).is_none() {
|
||||
return Err(GcError::TimelineNotFound);
|
||||
}
|
||||
};
|
||||
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(_timeline_id, timeline_entry)| {
|
||||
if let Some(ancestor_timeline_id) =
|
||||
&timeline_entry.get_ancestor_timeline_id()
|
||||
{
|
||||
// If target_timeline is specified, we only need to know branchpoints of its children
|
||||
if let Some(timeline_id) = target_timeline_id {
|
||||
if ancestor_timeline_id == &timeline_id {
|
||||
all_branchpoints.insert((
|
||||
*ancestor_timeline_id,
|
||||
timeline_entry.get_ancestor_lsn(),
|
||||
));
|
||||
}
|
||||
}
|
||||
// Collect branchpoints for all timelines
|
||||
else {
|
||||
all_branchpoints.insert((
|
||||
*ancestor_timeline_id,
|
||||
timeline_entry.get_ancestor_lsn(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
timeline_entry.clone()
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
(all_branchpoints, timelines)
|
||||
};
|
||||
|
||||
// Ok, we now know all the branch points.
|
||||
// Update the GC information for each timeline.
|
||||
let mut gc_timelines = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
// We filtered the timeline list above
|
||||
// If target_timeline is specified, ignore all other timelines
|
||||
if let Some(target_timeline_id) = target_timeline_id {
|
||||
assert_eq!(target_timeline_id, timeline.timeline_id);
|
||||
if timeline.timeline_id != target_timeline_id {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline.timeline_id, Lsn(0))),
|
||||
Included((timeline.timeline_id, Lsn(u64::MAX))),
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
|
||||
{
|
||||
let mut target = timeline.gc_info.write().unwrap();
|
||||
|
||||
@@ -2897,7 +2921,7 @@ impl Tenant {
|
||||
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
|
||||
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
|
||||
target.within_ancestor_pitr =
|
||||
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
|
||||
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2913,17 +2937,25 @@ impl Tenant {
|
||||
timeline.metrics.pitr_history_size.set(
|
||||
timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(target.cutoffs.pitr)
|
||||
.checked_sub(target.cutoffs.time)
|
||||
.unwrap_or(Lsn(0))
|
||||
.0,
|
||||
);
|
||||
|
||||
// Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
|
||||
// - this timeline was created while we were finding cutoffs
|
||||
// - lsn for timestamp search fails for this timeline repeatedly
|
||||
if let Some(cutoffs) = gc_cutoffs.remove(&timeline.timeline_id) {
|
||||
target.cutoffs = cutoffs;
|
||||
}
|
||||
match gc_cutoffs.remove(&timeline.timeline_id) {
|
||||
Some(cutoffs) => {
|
||||
target.retain_lsns = branchpoints;
|
||||
target.cutoffs = cutoffs;
|
||||
}
|
||||
None => {
|
||||
// reasons for this being unavailable:
|
||||
// - this timeline was created while we were finding cutoffs
|
||||
// - lsn for timestamp search fails for this timeline repeatedly
|
||||
//
|
||||
// in both cases, refreshing the branchpoints is correct.
|
||||
target.retain_lsns = branchpoints;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
gc_timelines.push(timeline);
|
||||
@@ -3720,7 +3752,6 @@ pub(crate) mod harness {
|
||||
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
|
||||
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||
trace_read_requests: Some(tenant_conf.trace_read_requests),
|
||||
eviction_policy: Some(tenant_conf.eviction_policy),
|
||||
min_resident_size_override: tenant_conf.min_resident_size_override,
|
||||
evictions_low_residence_duration_metric_threshold: Some(
|
||||
@@ -3766,7 +3797,7 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
impl TenantHarness {
|
||||
pub fn create_custom(
|
||||
pub async fn create_custom(
|
||||
test_name: &'static str,
|
||||
tenant_conf: TenantConf,
|
||||
tenant_id: TenantId,
|
||||
@@ -3802,7 +3833,7 @@ pub(crate) mod harness {
|
||||
},
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
|
||||
|
||||
Ok(Self {
|
||||
@@ -3817,7 +3848,7 @@ pub(crate) mod harness {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
pub async fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
// Disable automatic GC and compaction to make the unit tests more deterministic.
|
||||
// The tests perform them manually if needed.
|
||||
let tenant_conf = TenantConf {
|
||||
@@ -3834,6 +3865,7 @@ pub(crate) mod harness {
|
||||
shard,
|
||||
Generation::new(0xdeadbeef),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn span(&self) -> tracing::Span {
|
||||
@@ -3970,7 +4002,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_basic() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4017,7 +4049,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_duplicate_timelines() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")?
|
||||
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let _ = tenant
|
||||
@@ -4049,7 +4082,7 @@ mod tests {
|
||||
async fn test_branch() -> anyhow::Result<()> {
|
||||
use std::str::from_utf8;
|
||||
|
||||
let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4171,7 +4204,8 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
@@ -4218,7 +4252,8 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
|
||||
@@ -4240,7 +4275,7 @@ mod tests {
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("is earlier than latest GC horizon"));
|
||||
.contains("is earlier than latest GC cutoff"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4273,7 +4308,8 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?
|
||||
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
@@ -4307,7 +4343,7 @@ mod tests {
|
||||
{
|
||||
let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
|
||||
assert_eq!(branchpoints.len(), 1);
|
||||
assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID, None));
|
||||
assert_eq!(branchpoints[0], Lsn(0x40));
|
||||
}
|
||||
|
||||
// You can read the key from the child branch even though the parent is
|
||||
@@ -4330,7 +4366,8 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
|
||||
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
@@ -4360,10 +4397,10 @@ mod tests {
|
||||
}
|
||||
#[tokio::test]
|
||||
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
|
||||
.load()
|
||||
.await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4401,7 +4438,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn timeline_load() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
let harness = TenantHarness::create(TEST_NAME).await?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
@@ -4428,7 +4465,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "timeline_load_with_ancestor";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
let harness = TenantHarness::create(TEST_NAME).await?;
|
||||
// create two timelines
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
@@ -4476,7 +4513,10 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn delta_layer_dumping() -> anyhow::Result<()> {
|
||||
use storage_layer::AsLayerDesc;
|
||||
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4503,7 +4543,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_images() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -4674,7 +4714,7 @@ mod tests {
|
||||
//
|
||||
#[tokio::test]
|
||||
async fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_bulk_insert")?;
|
||||
let harness = TenantHarness::create("test_bulk_insert").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -4705,7 +4745,7 @@ mod tests {
|
||||
// so the search can stop at the first delta layer and doesn't traverse any deeper.
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored")?;
|
||||
let harness = TenantHarness::create("test_get_vectored").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -4783,7 +4823,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
|
||||
let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
@@ -4869,7 +4909,8 @@ mod tests {
|
||||
TenantId::generate(),
|
||||
ShardIdentity::unsharded(),
|
||||
Generation::new(0xdeadbeef),
|
||||
)?;
|
||||
)
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
@@ -5012,7 +5053,7 @@ mod tests {
|
||||
// ```
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
|
||||
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
@@ -5161,7 +5202,7 @@ mod tests {
|
||||
name: &'static str,
|
||||
compaction_algorithm: CompactionAlgorithm,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut harness = TenantHarness::create(name)?;
|
||||
let mut harness = TenantHarness::create(name).await?;
|
||||
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
|
||||
kind: compaction_algorithm,
|
||||
};
|
||||
@@ -5245,7 +5286,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_traverse_branches() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")?
|
||||
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let mut tline = tenant
|
||||
@@ -5335,7 +5377,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_traverse_ancestors() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")?
|
||||
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
let mut tline = tenant
|
||||
@@ -5401,7 +5444,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")?
|
||||
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
|
||||
@@ -5470,7 +5514,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_create_guard_crash() -> anyhow::Result<()> {
|
||||
let name = "test_create_guard_crash";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
let harness = TenantHarness::create(name).await?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
@@ -5523,7 +5567,7 @@ mod tests {
|
||||
name: &'static str,
|
||||
compaction_algorithm: CompactionAlgorithm,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut harness = TenantHarness::create(name)?;
|
||||
let mut harness = TenantHarness::create(name).await?;
|
||||
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
|
||||
kind: compaction_algorithm,
|
||||
};
|
||||
@@ -5547,7 +5591,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_scan() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_scan")?;
|
||||
let harness = TenantHarness::create("test_metadata_scan").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -5666,7 +5710,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
|
||||
let harness = TenantHarness::create("test_metadata_compaction_trigger").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -5725,7 +5769,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_branch_copies_dirty_aux_file_flag() {
|
||||
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
|
||||
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// the default aux file policy to switch is v1 if not set by the admins
|
||||
assert_eq!(
|
||||
@@ -5827,7 +5873,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_switch() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
|
||||
let mut harness = TenantHarness::create("aux_file_policy_switch")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -6001,7 +6049,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_force_switch() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
|
||||
let mut harness = TenantHarness::create("aux_file_policy_force_switch")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -6062,7 +6112,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_auto_detect() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
|
||||
let mut harness = TenantHarness::create("aux_file_policy_auto_detect")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -6125,7 +6177,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_image_creation")?;
|
||||
let harness = TenantHarness::create("test_metadata_image_creation").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -6224,7 +6276,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
|
||||
let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
@@ -6296,7 +6348,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
|
||||
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
@@ -6388,7 +6440,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_reads").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
@@ -6468,7 +6520,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_image_creation() {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
@@ -6540,8 +6594,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_empty_image_creation() {
|
||||
let harness =
|
||||
TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
@@ -6604,7 +6659,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
@@ -6696,8 +6751,8 @@ mod tests {
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.pitr = Lsn(0x30);
|
||||
guard.cutoffs.horizon = Lsn(0x30);
|
||||
guard.cutoffs.time = Lsn(0x30);
|
||||
guard.cutoffs.space = Lsn(0x30);
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
@@ -6788,7 +6843,7 @@ mod tests {
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..get_key(10),
|
||||
key_range: Key::MIN..Key::MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
@@ -6812,7 +6867,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_neon_test_record() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_neon_test_record")?;
|
||||
let harness = TenantHarness::create("test_neon_test_record").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
@@ -6893,7 +6948,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_lsn_lease() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
let end_lsn = Lsn(0x100);
|
||||
@@ -6982,7 +7037,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?;
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
@@ -7087,8 +7142,8 @@ mod tests {
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![],
|
||||
cutoffs: GcCutoffs {
|
||||
pitr: Lsn(0x30),
|
||||
horizon: Lsn(0x30),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
|
||||
@@ -137,14 +137,14 @@ impl<'a> BlockCursor<'a> {
|
||||
}
|
||||
|
||||
/// Reserved bits for length and compression
|
||||
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
|
||||
pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
|
||||
|
||||
/// The maximum size of blobs we support. The highest few bits
|
||||
/// are reserved for compression and other further uses.
|
||||
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
|
||||
|
||||
const BYTE_UNCOMPRESSED: u8 = 0x80;
|
||||
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
||||
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
|
||||
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
||||
|
||||
/// A wrapper of `VirtualFile` that allows users to write blobs.
|
||||
///
|
||||
@@ -273,12 +273,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
srcbuf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
self.write_blob_maybe_compressed(
|
||||
srcbuf,
|
||||
ctx,
|
||||
ImageCompressionAlgorithm::DisabledNoDecompress,
|
||||
)
|
||||
.await
|
||||
self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
@@ -340,8 +336,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
(BYTE_UNCOMPRESSED, len, slice.into_inner())
|
||||
}
|
||||
}
|
||||
ImageCompressionAlgorithm::Disabled
|
||||
| ImageCompressionAlgorithm::DisabledNoDecompress => {
|
||||
ImageCompressionAlgorithm::Disabled => {
|
||||
(BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
|
||||
}
|
||||
};
|
||||
@@ -395,51 +390,63 @@ impl BlobWriter<false> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
pub(crate) mod tests {
|
||||
use super::*;
|
||||
use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
|
||||
round_trip_test_compressed::<BUFFERED>(blobs, false).await
|
||||
}
|
||||
|
||||
async fn round_trip_test_compressed<const BUFFERED: bool>(
|
||||
pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
|
||||
blobs: &[Vec<u8>],
|
||||
compression: bool,
|
||||
) -> Result<(), Error> {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
|
||||
let temp_dir = camino_tempfile::tempdir()?;
|
||||
let pathbuf = temp_dir.path().join("file");
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
|
||||
// Write part (in block to drop the file)
|
||||
let mut offsets = Vec::new();
|
||||
{
|
||||
let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
|
||||
let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let (_, res) = if compression {
|
||||
wtr.write_blob_maybe_compressed(
|
||||
blob.clone(),
|
||||
&ctx,
|
||||
ctx,
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
wtr.write_blob(blob.clone(), &ctx).await
|
||||
wtr.write_blob(blob.clone(), ctx).await
|
||||
};
|
||||
let offs = res?;
|
||||
offsets.push(offs);
|
||||
}
|
||||
// Write out one page worth of zeros so that we can
|
||||
// read again with read_blk
|
||||
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
|
||||
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
|
||||
let offs = res?;
|
||||
println!("Writing final blob at offs={offs}");
|
||||
wtr.flush_buffer(&ctx).await?;
|
||||
wtr.flush_buffer(ctx).await?;
|
||||
}
|
||||
Ok((temp_dir, pathbuf, offsets))
|
||||
}
|
||||
|
||||
let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
|
||||
async fn round_trip_test_compressed<const BUFFERED: bool>(
|
||||
blobs: &[Vec<u8>],
|
||||
compression: bool,
|
||||
) -> Result<(), Error> {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let (_temp_dir, pathbuf, offsets) =
|
||||
write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
|
||||
|
||||
let file = VirtualFile::open(pathbuf, &ctx).await?;
|
||||
let rdr = BlockReaderRef::VirtualFile(&file);
|
||||
let rdr = BlockCursor::new_with_compression(rdr, compression);
|
||||
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
|
||||
@@ -452,7 +459,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn random_array(len: usize) -> Vec<u8> {
|
||||
pub(crate) fn random_array(len: usize) -> Vec<u8> {
|
||||
let mut rng = rand::thread_rng();
|
||||
(0..len).map(|_| rng.gen()).collect::<_>()
|
||||
}
|
||||
|
||||
@@ -202,18 +202,10 @@ pub struct FileBlockReader<'a> {
|
||||
|
||||
impl<'a> FileBlockReader<'a> {
|
||||
pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
|
||||
Self::new_with_compression(file, file_id, false)
|
||||
}
|
||||
|
||||
pub fn new_with_compression(
|
||||
file: &'a VirtualFile,
|
||||
file_id: FileId,
|
||||
compressed_reads: bool,
|
||||
) -> Self {
|
||||
FileBlockReader {
|
||||
file_id,
|
||||
file,
|
||||
compressed_reads,
|
||||
compressed_reads: true,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -335,7 +335,6 @@ pub struct TenantConf {
|
||||
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
|
||||
/// to avoid eager reconnects.
|
||||
pub max_lsn_wal_lag: NonZeroU64,
|
||||
pub trace_read_requests: bool,
|
||||
pub eviction_policy: EvictionPolicy,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
// See the corresponding metric's help string.
|
||||
@@ -436,10 +435,6 @@ pub struct TenantConfOpt {
|
||||
#[serde(default)]
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub trace_read_requests: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
@@ -519,9 +514,6 @@ impl TenantConfOpt {
|
||||
.lagging_wal_timeout
|
||||
.unwrap_or(global_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
|
||||
trace_read_requests: self
|
||||
.trace_read_requests
|
||||
.unwrap_or(global_conf.trace_read_requests),
|
||||
eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
|
||||
min_resident_size_override: self
|
||||
.min_resident_size_override
|
||||
@@ -581,7 +573,6 @@ impl Default for TenantConf {
|
||||
.expect("cannot parse default walreceiver lagging wal timeout"),
|
||||
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||
trace_read_requests: false,
|
||||
eviction_policy: EvictionPolicy::NoEviction,
|
||||
min_resident_size_override: None,
|
||||
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
||||
@@ -659,7 +650,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
|
||||
lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
|
||||
max_lsn_wal_lag: value.max_lsn_wal_lag,
|
||||
trace_read_requests: value.trace_read_requests,
|
||||
eviction_policy: value.eviction_policy,
|
||||
min_resident_size_override: value.min_resident_size_override,
|
||||
evictions_low_residence_duration_metric_threshold: value
|
||||
|
||||
@@ -262,7 +262,7 @@ where
|
||||
|
||||
pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
|
||||
where
|
||||
R: 'a,
|
||||
R: 'a + Send,
|
||||
{
|
||||
DiskBtreeIterator {
|
||||
stream: Box::pin(self.into_stream(start_key, ctx)),
|
||||
@@ -521,7 +521,7 @@ where
|
||||
pub struct DiskBtreeIterator<'a> {
|
||||
#[allow(clippy::type_complexity)]
|
||||
stream: std::pin::Pin<
|
||||
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
|
||||
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
|
||||
>,
|
||||
}
|
||||
|
||||
@@ -550,10 +550,10 @@ where
|
||||
/// We maintain the length of the stack to be always greater than zero.
|
||||
/// Two exceptions are:
|
||||
/// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
|
||||
/// So because other methods cannot see the intermediate state invariant still holds.
|
||||
/// So because other methods cannot see the intermediate state invariant still holds.
|
||||
/// 2. `Self::finish`. It consumes self and does not return it back,
|
||||
/// which means that this is where the structure is destroyed.
|
||||
/// Thus stack of zero length cannot be observed by other methods.
|
||||
/// which means that this is where the structure is destroyed.
|
||||
/// Thus stack of zero length cannot be observed by other methods.
|
||||
stack: Vec<BuildNode<L>>,
|
||||
|
||||
/// Last key that was appended to the tree. Used to sanity check that append
|
||||
|
||||
@@ -51,7 +51,7 @@ use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum, KeySpaceRandomAccum};
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
@@ -61,7 +61,7 @@ use utils::lsn::Lsn;
|
||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
pub use historic_layer_coverage::LayerKey;
|
||||
|
||||
use super::storage_layer::{LayerVisibility, PersistentLayerDesc};
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
@@ -870,164 +870,6 @@ impl LayerMap {
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// `read_points` represent the tip of a timeline and any branch points, i.e. the places
|
||||
/// where we expect to serve reads.
|
||||
///
|
||||
/// This function is O(N) and should be called infrequently. The caller is responsible for
|
||||
/// looking up and updating the Layer objects for these layer descriptors.
|
||||
pub(crate) fn get_visibility(
|
||||
&self,
|
||||
mut read_points: Vec<(Lsn, KeySpace)>,
|
||||
) -> (Vec<(Arc<PersistentLayerDesc>, LayerVisibility)>, KeySpace) {
|
||||
// This is like a KeySpace, but written for efficient subtraction of layers and unions with KeySpaces
|
||||
struct KeyShadow {
|
||||
// FIXME: consider efficiency. KeySpace is a flat vector, so in principle fairly inefficient for
|
||||
// repeatedly calling contains(), BUT as we iterate through the layermap we expect the shadow to shrink
|
||||
// to something quite small, and for small collections an algorithmically expensive vector is often better
|
||||
// for performance than a more algorithmically cheap data structure.
|
||||
inner: KeySpace,
|
||||
}
|
||||
|
||||
impl KeyShadow {
|
||||
fn new(keyspace: KeySpace) -> Self {
|
||||
Self { inner: keyspace }
|
||||
}
|
||||
|
||||
fn contains(&self, range: Range<Key>) -> bool {
|
||||
self.inner.overlaps(&range)
|
||||
}
|
||||
|
||||
/// Return true if anything was removed.
|
||||
fn subtract(&mut self, range: Range<Key>) -> bool {
|
||||
let removed = self.inner.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![range],
|
||||
});
|
||||
!removed.ranges.is_empty()
|
||||
}
|
||||
|
||||
fn union_with(&mut self, keyspace: KeySpace) {
|
||||
let mut accum = KeySpaceRandomAccum::new();
|
||||
let prev = std::mem::take(&mut self.inner);
|
||||
accum.add_keyspace(prev);
|
||||
accum.add_keyspace(keyspace);
|
||||
self.inner = accum.to_keyspace();
|
||||
}
|
||||
}
|
||||
|
||||
// The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
|
||||
// and a ReadPoint
|
||||
read_points.sort_by_key(|rp| rp.0);
|
||||
let mut shadow = KeyShadow::new(
|
||||
read_points
|
||||
.pop()
|
||||
.expect("Every timeline has at least one read point")
|
||||
.1,
|
||||
);
|
||||
|
||||
// We will interleave all our read points and layers into a sorted collection
|
||||
enum Item {
|
||||
ReadPoint { lsn: Lsn, keyspace: KeySpace },
|
||||
Layer(Arc<PersistentLayerDesc>),
|
||||
}
|
||||
|
||||
let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
|
||||
items.extend(self.iter_historic_layers().map(Item::Layer));
|
||||
items.extend(read_points.into_iter().map(|rp| Item::ReadPoint {
|
||||
lsn: rp.0,
|
||||
keyspace: rp.1,
|
||||
}));
|
||||
|
||||
// Ordering: we want to iterate like this:
|
||||
// 1. Highest LSNs first
|
||||
// 2. Consider ReadPoints before image layers if they're at the same LSN
|
||||
items.sort_by_key(|item| {
|
||||
std::cmp::Reverse(match item {
|
||||
Item::ReadPoint {
|
||||
lsn,
|
||||
keyspace: _keyspace,
|
||||
} => (*lsn, 0),
|
||||
Item::Layer(layer) => {
|
||||
if layer.is_delta() {
|
||||
(layer.get_lsn_range().end, 1)
|
||||
} else {
|
||||
(layer.image_layer_lsn(), 2)
|
||||
}
|
||||
}
|
||||
})
|
||||
});
|
||||
|
||||
let mut results = Vec::with_capacity(self.historic.len());
|
||||
|
||||
// TODO: handle delta layers properly with multiple read points: if a read point intersects a delta layer, we might already
|
||||
// have encountered it and marked it as not-visible. We need to keep track of which delta layers we are currently within, and
|
||||
// when we encounter a ReadPoint, update the delta layer's visibility as needed.
|
||||
// let mut pending_delta : Vec= ...
|
||||
let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
|
||||
|
||||
for item in items {
|
||||
let (reached_lsn, is_readpoint) = match &item {
|
||||
Item::ReadPoint {
|
||||
lsn,
|
||||
keyspace: _keyspace,
|
||||
} => (lsn, true),
|
||||
Item::Layer(layer) => (&layer.lsn_range.start, false),
|
||||
};
|
||||
maybe_covered_deltas.retain(|d| {
|
||||
if *reached_lsn >= d.lsn_range.start && is_readpoint {
|
||||
// We encountered a readpoint within the delta layer: it is visible
|
||||
results.push((d.clone(), LayerVisibility::Visible));
|
||||
false
|
||||
} else if *reached_lsn < d.lsn_range.start {
|
||||
// We passed the layer's range without encountering a read point: it is not visible
|
||||
results.push((d.clone(), LayerVisibility::Covered));
|
||||
false
|
||||
} else {
|
||||
// We're still in the delta layer: continue iterating
|
||||
true
|
||||
}
|
||||
});
|
||||
|
||||
match item {
|
||||
Item::ReadPoint {
|
||||
lsn: _lsn,
|
||||
keyspace,
|
||||
} => {
|
||||
shadow.union_with(keyspace);
|
||||
}
|
||||
Item::Layer(layer) => {
|
||||
let visibility = if layer.is_delta() {
|
||||
if shadow.contains(layer.get_key_range()) {
|
||||
LayerVisibility::Visible
|
||||
} else {
|
||||
// If a layer isn't visible based on current state, we must defer deciding whether
|
||||
// it is truly not visible until we have advanced past the delta's range: we might
|
||||
// encounter another branch point within this delta layer's LSN range.
|
||||
maybe_covered_deltas.push(layer);
|
||||
continue;
|
||||
}
|
||||
} else if shadow.subtract(layer.get_key_range()) {
|
||||
// An image layer, which overlapped with the shadow
|
||||
LayerVisibility::Visible
|
||||
} else {
|
||||
// An image layer, which did not overlap with the shadow
|
||||
LayerVisibility::Covered
|
||||
};
|
||||
|
||||
results.push((layer, visibility));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drain any remaining maybe_covered deltas
|
||||
results.extend(
|
||||
maybe_covered_deltas
|
||||
.into_iter()
|
||||
.map(|d| (d, LayerVisibility::Covered)),
|
||||
);
|
||||
|
||||
(results, shadow.inner)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -521,10 +521,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
|
||||
Ok(&self.historic_coverage)
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.layers.len()
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -2698,7 +2698,9 @@ mod tests {
|
||||
// Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
|
||||
// wait for it to complete before proceeding.
|
||||
|
||||
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
|
||||
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant")
|
||||
.await
|
||||
.unwrap();
|
||||
let (t, _ctx) = h.load().await;
|
||||
|
||||
// harness loads it to active, which is forced and nothing is running on the tenant
|
||||
|
||||
@@ -241,7 +241,7 @@ use self::index::IndexPart;
|
||||
|
||||
use super::metadata::MetadataUpdate;
|
||||
use super::storage_layer::{Layer, LayerName, ResidentLayer};
|
||||
use super::upload_queue::SetDeletedFlagProgress;
|
||||
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
|
||||
use super::Generation;
|
||||
|
||||
pub(crate) use download::{
|
||||
@@ -1930,6 +1930,31 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
|
||||
/// externally to RemoteTimelineClient.
|
||||
pub(crate) fn initialized_upload_queue(
|
||||
&self,
|
||||
) -> Result<UploadQueueAccessor<'_>, NotInitialized> {
|
||||
let mut inner = self.upload_queue.lock().unwrap();
|
||||
inner.initialized_mut()?;
|
||||
Ok(UploadQueueAccessor { inner })
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct UploadQueueAccessor<'a> {
|
||||
inner: std::sync::MutexGuard<'a, UploadQueue>,
|
||||
}
|
||||
|
||||
impl<'a> UploadQueueAccessor<'a> {
|
||||
pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
|
||||
match &*self.inner {
|
||||
UploadQueue::Initialized(x) => &x.clean.0,
|
||||
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
|
||||
unreachable!("checked before constructing")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
@@ -2103,7 +2128,7 @@ mod tests {
|
||||
impl TestSetup {
|
||||
async fn new(test_name: &str) -> anyhow::Result<Self> {
|
||||
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let harness = TenantHarness::create(test_name).await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
|
||||
@@ -176,6 +176,24 @@ pub(crate) struct Lineage {
|
||||
///
|
||||
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
|
||||
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
|
||||
// FIXME: this is insufficient even for path of two timelines for future wal recovery
|
||||
// purposes:
|
||||
//
|
||||
// assuming a "old main" which has received most of the WAL, and has a branch "new main",
|
||||
// starting a bit before "old main" last_record_lsn. the current version works fine,
|
||||
// because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
|
||||
//
|
||||
// then assuming "new main" would similarly receive a branch right before its last_record_lsn,
|
||||
// "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
|
||||
// here. however, we cannot recover from WAL using only that information, we would need the
|
||||
// whole ancestry here:
|
||||
//
|
||||
// ```json
|
||||
// [
|
||||
// ["old main", ancestor_lsn("new main"), _],
|
||||
// ["new main", ancestor_lsn("new new main"), _]
|
||||
// ]
|
||||
// ```
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
|
||||
}
|
||||
@@ -217,6 +235,14 @@ impl Lineage {
|
||||
self.original_ancestor
|
||||
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
|
||||
}
|
||||
|
||||
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
|
||||
self.original_ancestor.is_some()
|
||||
}
|
||||
|
||||
pub(crate) fn is_reparented(&self) -> bool {
|
||||
!self.reparenting_history.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -135,11 +135,9 @@ pub struct TimelineInputs {
|
||||
ancestor_lsn: Lsn,
|
||||
last_record: Lsn,
|
||||
latest_gc_cutoff: Lsn,
|
||||
horizon_cutoff: Lsn,
|
||||
pitr_cutoff: Lsn,
|
||||
|
||||
/// Cutoff point based on GC settings
|
||||
next_gc_cutoff: Lsn,
|
||||
next_pitr_cutoff: Lsn,
|
||||
|
||||
/// Cutoff point calculated from the user-supplied 'max_retention_period'
|
||||
retention_param_cutoff: Option<Lsn>,
|
||||
@@ -150,7 +148,7 @@ pub struct TimelineInputs {
|
||||
|
||||
/// Gathers the inputs for the tenant sizing model.
|
||||
///
|
||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||
/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which
|
||||
/// is updated on-demand, during the start of this calculation and separate from the
|
||||
/// [`TimelineInputs::latest_gc_cutoff`].
|
||||
///
|
||||
@@ -158,11 +156,8 @@ pub struct TimelineInputs {
|
||||
///
|
||||
/// ```text
|
||||
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
||||
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
||||
/// initdb_lsn branchpoints* next_pitr_cutoff latest
|
||||
/// ```
|
||||
///
|
||||
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
|
||||
/// tenant size will be zero.
|
||||
pub(super) async fn gather_inputs(
|
||||
tenant: &Tenant,
|
||||
limit: &Arc<Semaphore>,
|
||||
@@ -172,7 +167,7 @@ pub(super) async fn gather_inputs(
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
|
||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||
// refresh is needed to update [`timeline::GcCutoffs`]
|
||||
tenant.refresh_gc_info(cancel, ctx).await?;
|
||||
|
||||
// Collect information about all the timelines
|
||||
@@ -236,20 +231,18 @@ pub(super) async fn gather_inputs(
|
||||
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
|
||||
// actually removing files.
|
||||
//
|
||||
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
|
||||
// We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from
|
||||
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
|
||||
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
|
||||
// than our internal space cutoff. This means that if someone drops a database and waits for their
|
||||
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
|
||||
// horizon_cutoff.
|
||||
let pitr_cutoff = gc_info.cutoffs.pitr;
|
||||
let horizon_cutoff = gc_info.cutoffs.horizon;
|
||||
let mut next_gc_cutoff = pitr_cutoff;
|
||||
// the space cutoff.
|
||||
let mut next_pitr_cutoff = gc_info.cutoffs.time;
|
||||
|
||||
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
|
||||
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
|
||||
let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
|
||||
if next_gc_cutoff < param_cutoff {
|
||||
next_gc_cutoff = param_cutoff;
|
||||
if next_pitr_cutoff < param_cutoff {
|
||||
next_pitr_cutoff = param_cutoff;
|
||||
}
|
||||
Some(param_cutoff)
|
||||
} else {
|
||||
@@ -263,7 +256,7 @@ pub(super) async fn gather_inputs(
|
||||
.copied()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
|
||||
// next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we
|
||||
// want to query any logical size before initdb_lsn.
|
||||
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
|
||||
|
||||
@@ -271,14 +264,10 @@ pub(super) async fn gather_inputs(
|
||||
let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
|
||||
.retain_lsns
|
||||
.iter()
|
||||
.filter_map(|(lsn, _child_id, _)| {
|
||||
if lsn > &ancestor_lsn {
|
||||
// this assumes there are no other retain_lsns than the branchpoints
|
||||
Some((*lsn, LsnKind::BranchPoint))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.filter(|&&lsn| lsn > ancestor_lsn)
|
||||
.copied()
|
||||
// this assumes there are no other retain_lsns than the branchpoints
|
||||
.map(|lsn| (lsn, LsnKind::BranchPoint))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
|
||||
@@ -295,10 +284,10 @@ pub(super) async fn gather_inputs(
|
||||
)
|
||||
}
|
||||
|
||||
// Add a point for the GC cutoff
|
||||
let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
|
||||
// Add a point for the PITR cutoff
|
||||
let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
|
||||
if !branch_start_needed {
|
||||
lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
|
||||
lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
|
||||
}
|
||||
|
||||
lsns.sort_unstable();
|
||||
@@ -337,7 +326,7 @@ pub(super) async fn gather_inputs(
|
||||
parent: Some(parent),
|
||||
lsn: lsn.0,
|
||||
size: None,
|
||||
needed: lsn > next_gc_cutoff,
|
||||
needed: lsn > next_pitr_cutoff,
|
||||
},
|
||||
timeline_id: timeline.timeline_id,
|
||||
kind,
|
||||
@@ -361,8 +350,8 @@ pub(super) async fn gather_inputs(
|
||||
segment: Segment {
|
||||
parent: Some(lease_parent),
|
||||
lsn: lsn.0,
|
||||
size: None, // Filled in later, if necessary
|
||||
needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
|
||||
size: None, // Filled in later, if necessary
|
||||
needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention.
|
||||
},
|
||||
timeline_id: timeline.timeline_id,
|
||||
kind: LsnKind::LeaseStart,
|
||||
@@ -402,9 +391,7 @@ pub(super) async fn gather_inputs(
|
||||
last_record: last_record_lsn,
|
||||
// this is not used above, because it might not have updated recently enough
|
||||
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
next_gc_cutoff,
|
||||
next_pitr_cutoff,
|
||||
retention_param_cutoff,
|
||||
lease_points,
|
||||
});
|
||||
@@ -746,9 +733,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"ancestor_lsn": "0/18D3D98",
|
||||
"last_record": "0/2230CD0",
|
||||
"latest_gc_cutoff": "0/1698C48",
|
||||
"horizon_cutoff": "0/2210CD0",
|
||||
"pitr_cutoff": "0/2210CD0",
|
||||
"next_gc_cutoff": "0/2210CD0",
|
||||
"next_pitr_cutoff": "0/2210CD0",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
},
|
||||
@@ -757,9 +742,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"ancestor_lsn": "0/176D998",
|
||||
"last_record": "0/1837770",
|
||||
"latest_gc_cutoff": "0/1698C48",
|
||||
"horizon_cutoff": "0/1817770",
|
||||
"pitr_cutoff": "0/1817770",
|
||||
"next_gc_cutoff": "0/1817770",
|
||||
"next_pitr_cutoff": "0/1817770",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
},
|
||||
@@ -768,9 +751,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"ancestor_lsn": "0/0",
|
||||
"last_record": "0/18D3D98",
|
||||
"latest_gc_cutoff": "0/1698C48",
|
||||
"horizon_cutoff": "0/18B3D98",
|
||||
"pitr_cutoff": "0/18B3D98",
|
||||
"next_gc_cutoff": "0/18B3D98",
|
||||
"next_pitr_cutoff": "0/18B3D98",
|
||||
"retention_param_cutoff": null,
|
||||
"lease_points": []
|
||||
}
|
||||
@@ -824,9 +805,7 @@ fn verify_size_for_one_branch() {
|
||||
"ancestor_lsn": "0/0",
|
||||
"last_record": "47/280A5860",
|
||||
"latest_gc_cutoff": "47/240A5860",
|
||||
"horizon_cutoff": "47/240A5860",
|
||||
"pitr_cutoff": "47/240A5860",
|
||||
"next_gc_cutoff": "47/240A5860",
|
||||
"next_pitr_cutoff": "47/240A5860",
|
||||
"retention_param_cutoff": "0/0",
|
||||
"lease_points": []
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ pub(crate) mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
@@ -457,26 +458,6 @@ pub enum ValueReconstructResult {
|
||||
Missing,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) enum LayerVisibility {
|
||||
/// A Visible layer might be read while serving a read, because there is not an image layer between it
|
||||
/// and a readable LSN (the tip of the branch or a child's branch point)
|
||||
Visible,
|
||||
/// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
|
||||
/// a branch or ephemeral endpoint at an LSN below the layer that covers this.
|
||||
Covered,
|
||||
/// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
|
||||
/// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
|
||||
/// state is for when existing layers are constructed while loading a timeline.
|
||||
Uninitialized,
|
||||
}
|
||||
|
||||
impl Default for LayerVisibility {
|
||||
fn default() -> Self {
|
||||
Self::Uninitialized
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
|
||||
|
||||
@@ -488,7 +469,6 @@ pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
|
||||
struct LayerAccessStatsLocked {
|
||||
for_scraping_api: LayerAccessStatsInner,
|
||||
for_eviction_policy: LayerAccessStatsInner,
|
||||
visibility: LayerVisibility,
|
||||
}
|
||||
|
||||
impl LayerAccessStatsLocked {
|
||||
@@ -612,13 +592,7 @@ impl LayerAccessStats {
|
||||
inner.count_by_access_kind[access_kind] += 1;
|
||||
inner.task_kind_flag |= ctx.task_kind();
|
||||
inner.last_accesses.write(this_access);
|
||||
});
|
||||
|
||||
// We may access a layer marked as Covered, if a new branch was created that depends on
|
||||
// this layer, and background updates to layer visibility didn't notice it yet
|
||||
if !matches!(locked.visibility, LayerVisibility::Visible) {
|
||||
locked.visibility = LayerVisibility::Visible;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn as_api_model(
|
||||
@@ -701,26 +675,24 @@ impl LayerAccessStats {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibility) {
|
||||
self.0.lock().unwrap().visibility = visibility;
|
||||
}
|
||||
/// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
|
||||
///
|
||||
/// This indicates whether the layer has been used for some purpose that would motivate
|
||||
/// us to keep it on disk, such as for serving a getpage request.
|
||||
fn accessed(&self) -> bool {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
|
||||
pub(crate) fn get_visibility(&self) -> LayerVisibility {
|
||||
self.0.lock().unwrap().visibility.clone()
|
||||
}
|
||||
|
||||
/// Summarize how likely this layer is to be used: its access time (if accessed), and its visibility hint.
|
||||
pub(crate) fn atime_visibility(&self) -> (Option<SystemTime>, LayerVisibility) {
|
||||
let state = self.0.lock().unwrap();
|
||||
|
||||
(
|
||||
state
|
||||
.for_eviction_policy
|
||||
.last_accesses
|
||||
.recent()
|
||||
.map(|a| a.when),
|
||||
state.visibility.clone(),
|
||||
)
|
||||
// Consider it accessed if the most recent access is more recent than
|
||||
// the most recent change in residence status.
|
||||
match (
|
||||
inner.last_accesses.recent(),
|
||||
inner.last_residence_changes.recent(),
|
||||
) {
|
||||
(None, _) => false,
|
||||
(Some(_), None) => true,
|
||||
(Some(a), Some(r)) => a.when >= r.timestamp,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
@@ -53,6 +56,7 @@ use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
@@ -223,6 +227,11 @@ pub struct DeltaLayerInner {
|
||||
file: VirtualFile,
|
||||
file_id: FileId,
|
||||
|
||||
#[allow(dead_code)]
|
||||
layer_key_range: Range<Key>,
|
||||
#[allow(dead_code)]
|
||||
layer_lsn_range: Range<Lsn>,
|
||||
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
@@ -453,7 +462,7 @@ impl DeltaLayerWriterInner {
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
// We don't want to use compression in delta layer creation
|
||||
let compression = ImageCompressionAlgorithm::DisabledNoDecompress;
|
||||
let compression = ImageCompressionAlgorithm::Disabled;
|
||||
let (val, res) = self
|
||||
.blob_writer
|
||||
.write_blob_maybe_compressed(val, ctx, compression)
|
||||
@@ -742,6 +751,14 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
pub(crate) fn key_range(&self) -> &Range<Key> {
|
||||
&self.layer_key_range
|
||||
}
|
||||
|
||||
pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.layer_lsn_range
|
||||
}
|
||||
|
||||
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
||||
/// - inner has the success or transient failure
|
||||
/// - outer has the permanent failure
|
||||
@@ -790,6 +807,8 @@ impl DeltaLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
max_vectored_read_bytes,
|
||||
layer_key_range: actual_summary.key_range,
|
||||
layer_lsn_range: actual_summary.lsn_range,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -1163,9 +1182,7 @@ impl DeltaLayerInner {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
|
||||
Adapter(self),
|
||||
)),
|
||||
layer: self,
|
||||
};
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
@@ -1304,7 +1321,7 @@ impl DeltaLayerInner {
|
||||
offsets.start.pos(),
|
||||
offsets.end.pos(),
|
||||
meta,
|
||||
Some(max_read_size),
|
||||
max_read_size,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
@@ -1409,7 +1426,7 @@ impl DeltaLayerInner {
|
||||
let keys = self.load_keys(ctx).await?;
|
||||
|
||||
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let buf = val.load_raw(ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
@@ -1444,8 +1461,7 @@ impl DeltaLayerInner {
|
||||
use pageserver_api::key::CHECKPOINT_KEY;
|
||||
use postgres_ffi::CheckPoint;
|
||||
if key == CHECKPOINT_KEY {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let val = val.load(ctx).await?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
let checkpoint = CheckPoint::decode(&img)?;
|
||||
@@ -1498,7 +1514,6 @@ impl DeltaLayerInner {
|
||||
offset
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
@@ -1509,7 +1524,7 @@ impl DeltaLayerInner {
|
||||
index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
|
||||
key_values_batch: std::collections::VecDeque::new(),
|
||||
is_end: false,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
|
||||
planner: StreamingVectoredReadPlanner::new(
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
),
|
||||
@@ -1530,17 +1545,24 @@ pub struct DeltaEntry<'a> {
|
||||
/// Reference to an on-disk value
|
||||
pub struct ValueRef<'a> {
|
||||
blob_ref: BlobRef,
|
||||
reader: BlockCursor<'a>,
|
||||
layer: &'a DeltaLayerInner,
|
||||
}
|
||||
|
||||
impl<'a> ValueRef<'a> {
|
||||
/// Loads the value from disk
|
||||
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
|
||||
// theoretically we *could* record an access time for each, but it does not really matter
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
|
||||
let buf = self.load_raw(ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
Ok(val)
|
||||
}
|
||||
|
||||
async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
|
||||
let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
|
||||
self.layer,
|
||||
)));
|
||||
let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct Adapter<T>(T);
|
||||
@@ -1574,17 +1596,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub struct DeltaLayerIterator<'a> {
|
||||
delta_layer: &'a DeltaLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
|
||||
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
|
||||
planner: StreamingVectoredReadPlanner,
|
||||
index_iter: DiskBtreeIterator<'a>,
|
||||
key_values_batch: VecDeque<(Key, Lsn, Value)>,
|
||||
is_end: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> DeltaLayerIterator<'a> {
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
@@ -1598,13 +1618,17 @@ impl<'a> DeltaLayerIterator<'a> {
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
let offset = blob_ref.pos();
|
||||
if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
|
||||
if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
|
||||
break batch_plan;
|
||||
}
|
||||
} else {
|
||||
self.is_end = true;
|
||||
let data_end_offset = self.delta_layer.index_start_offset();
|
||||
break self.planner.handle_range_end(data_end_offset);
|
||||
if let Some(item) = self.planner.handle_range_end(data_end_offset) {
|
||||
break item;
|
||||
} else {
|
||||
return Ok(()); // TODO: test empty iterator
|
||||
}
|
||||
}
|
||||
};
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
|
||||
@@ -1639,7 +1663,7 @@ impl<'a> DeltaLayerIterator<'a> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
pub(crate) mod test {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use itertools::MinMaxResult;
|
||||
@@ -1647,6 +1671,7 @@ mod test {
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::repository::Value;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::Tenant;
|
||||
@@ -1656,6 +1681,7 @@ mod test {
|
||||
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
use bytes::Bytes;
|
||||
|
||||
/// Construct an index for a fictional delta layer and and then
|
||||
/// traverse in order to plan vectored reads for a query. Finally,
|
||||
@@ -1908,7 +1934,7 @@ mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
|
||||
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
@@ -2008,7 +2034,9 @@ mod test {
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let ctx = &ctx;
|
||||
let timeline = tenant
|
||||
@@ -2217,15 +2245,31 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
async fn produce_delta_layer(
|
||||
pub(crate) fn sort_delta(
|
||||
(k1, l1, _): &(Key, Lsn, Value),
|
||||
(k2, l2, _): &(Key, Lsn, Value),
|
||||
) -> std::cmp::Ordering {
|
||||
(k1, l1).cmp(&(k2, l2))
|
||||
}
|
||||
|
||||
pub(crate) fn sort_delta_value(
|
||||
(k1, l1, v1): &(Key, Lsn, Value),
|
||||
(k2, l2, v2): &(Key, Lsn, Value),
|
||||
) -> std::cmp::Ordering {
|
||||
let order_1 = if v1.is_image() { 0 } else { 1 };
|
||||
let order_2 = if v2.is_image() { 0 } else { 1 };
|
||||
(k1, l1, order_1).cmp(&(k2, l2, order_2))
|
||||
}
|
||||
|
||||
pub(crate) async fn produce_delta_layer(
|
||||
tenant: &Tenant,
|
||||
tline: &Arc<Timeline>,
|
||||
mut deltas: Vec<(Key, Lsn, Value)>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2)));
|
||||
deltas.sort_by(sort_delta);
|
||||
let (key_start, _, _) = deltas.first().unwrap();
|
||||
let (key_max, _, _) = deltas.first().unwrap();
|
||||
let (key_max, _, _) = deltas.last().unwrap();
|
||||
let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
|
||||
let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
|
||||
let lsn_end = Lsn(lsn_max.0 + 1);
|
||||
@@ -2270,10 +2314,7 @@ mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_layer_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("delta_layer_iterator").unwrap();
|
||||
let harness = TenantHarness::create("delta_layer_iterator").await.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
|
||||
@@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
@@ -50,6 +53,7 @@ use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
@@ -165,7 +169,6 @@ pub struct ImageLayerInner {
|
||||
file_id: FileId,
|
||||
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
compressed_reads: bool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
@@ -179,8 +182,7 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
@@ -268,10 +270,9 @@ impl ImageLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded =
|
||||
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
|
||||
// not production code
|
||||
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
|
||||
@@ -372,6 +373,14 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(crate) fn key_range(&self) -> &Range<Key> {
|
||||
&self.key_range
|
||||
}
|
||||
|
||||
pub(crate) fn lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
||||
/// - inner has the success or transient failure
|
||||
/// - outer has the permanent failure
|
||||
@@ -380,7 +389,6 @@ impl ImageLayerInner {
|
||||
lsn: Lsn,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
support_compressed_reads: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path, ctx).await {
|
||||
@@ -424,7 +432,6 @@ impl ImageLayerInner {
|
||||
file,
|
||||
file_id,
|
||||
max_vectored_read_bytes,
|
||||
compressed_reads: support_compressed_reads,
|
||||
key_range: actual_summary.key_range,
|
||||
}))
|
||||
}
|
||||
@@ -435,8 +442,7 @@ impl ImageLayerInner {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
|
||||
@@ -496,14 +502,12 @@ impl ImageLayerInner {
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
let mut result = Vec::new();
|
||||
let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx));
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
while let Some(item) = stream.next().await {
|
||||
// TODO: dedup code with get_reconstruct_value
|
||||
@@ -538,8 +542,7 @@ impl ImageLayerInner {
|
||||
.into(),
|
||||
);
|
||||
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
|
||||
@@ -698,19 +701,17 @@ impl ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
|
||||
let block_reader =
|
||||
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
ImageLayerIterator {
|
||||
image_layer: self,
|
||||
ctx,
|
||||
index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
|
||||
key_values_batch: std::collections::VecDeque::new(),
|
||||
key_values_batch: VecDeque::new(),
|
||||
is_end: false,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
|
||||
planner: StreamingVectoredReadPlanner::new(
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
),
|
||||
@@ -737,6 +738,9 @@ struct ImageLayerWriterInner {
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
// Total uncompressed bytes passed into put_image
|
||||
uncompressed_bytes: u64,
|
||||
|
||||
blob_writer: BlobWriter<false>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
}
|
||||
@@ -792,6 +796,7 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
uncompressed_bytes: 0,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
@@ -809,7 +814,12 @@ impl ImageLayerWriterInner {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
|
||||
let compression = self.conf.image_compression;
|
||||
self.uncompressed_bytes += img.len() as u64;
|
||||
let (_img, res) = self
|
||||
.blob_writer
|
||||
.write_blob_maybe_compressed(img, ctx, compression)
|
||||
.await;
|
||||
// TODO: re-use the buffer for `img` further upstack
|
||||
let off = res?;
|
||||
|
||||
@@ -831,6 +841,11 @@ impl ImageLayerWriterInner {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
// Calculate compression ratio
|
||||
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
|
||||
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
|
||||
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
|
||||
|
||||
let mut file = self.blob_writer.into_inner();
|
||||
|
||||
// Write out the index
|
||||
@@ -970,17 +985,15 @@ impl Drop for ImageLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub struct ImageLayerIterator<'a> {
|
||||
image_layer: &'a ImageLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
|
||||
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
|
||||
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
|
||||
planner: StreamingVectoredReadPlanner,
|
||||
index_iter: DiskBtreeIterator<'a>,
|
||||
key_values_batch: VecDeque<(Key, Lsn, Value)>,
|
||||
is_end: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> ImageLayerIterator<'a> {
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
@@ -994,14 +1007,17 @@ impl<'a> ImageLayerIterator<'a> {
|
||||
Key::from_slice(&raw_key[..KEY_SIZE]),
|
||||
self.image_layer.lsn,
|
||||
offset,
|
||||
BlobFlag::None,
|
||||
) {
|
||||
break batch_plan;
|
||||
}
|
||||
} else {
|
||||
self.is_end = true;
|
||||
let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
break self.planner.handle_range_end(payload_end);
|
||||
if let Some(item) = self.planner.handle_range_end(payload_end) {
|
||||
break item;
|
||||
} else {
|
||||
return Ok(()); // TODO: a test case on empty iterator
|
||||
}
|
||||
}
|
||||
};
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
|
||||
@@ -1095,6 +1111,7 @@ mod test {
|
||||
ShardIdentity::unsharded(),
|
||||
get_next_gen(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
@@ -1161,6 +1178,7 @@ mod test {
|
||||
// But here, all we care about is that the gen number is unique.
|
||||
get_next_gen(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
@@ -1292,7 +1310,7 @@ mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_iterator() {
|
||||
let harness = TenantHarness::create("image_layer_iterator").unwrap();
|
||||
let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
|
||||
@@ -715,16 +715,22 @@ impl InMemoryLayer {
|
||||
res?;
|
||||
}
|
||||
}
|
||||
|
||||
// Hold the permit until the IO is done; if we didn't, one could drop this future,
|
||||
// thereby releasing the permit, but the Vec<u8> remains allocated until the IO completes.
|
||||
// => we'd have more concurrenct Vec<u8> than allowed as per the semaphore.
|
||||
drop(_concurrency_permit);
|
||||
}
|
||||
}
|
||||
|
||||
// MAX is used here because we identify L0 layers by full key range
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
|
||||
|
||||
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
|
||||
//
|
||||
// If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
|
||||
// the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
|
||||
// Thus, we'd have more concurrenct `Vec<u8>` in existence than the semaphore allows.
|
||||
//
|
||||
// We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
|
||||
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
|
||||
drop(_concurrency_permit);
|
||||
|
||||
Ok(Some(delta_layer))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -250,8 +250,6 @@ impl Layer {
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
// Newly created layers are marked visible by default: the usual case is that they were created to be read.
|
||||
access_stats.set_visibility(super::LayerVisibility::Visible);
|
||||
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
@@ -387,6 +385,7 @@ impl Layer {
|
||||
}
|
||||
|
||||
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
@@ -695,6 +694,18 @@ impl Drop for LayerInner {
|
||||
// and we could be delaying shutdown for nothing.
|
||||
}
|
||||
|
||||
if let Some(timeline) = self.timeline.upgrade() {
|
||||
// Only need to decrement metrics if the timeline still exists: otherwise
|
||||
// it will have already de-registered these metrics via TimelineMetrics::shutdown
|
||||
if self.desc.is_delta() {
|
||||
timeline.metrics.layer_count_delta.dec();
|
||||
timeline.metrics.layer_size_delta.sub(self.desc.file_size);
|
||||
} else {
|
||||
timeline.metrics.layer_count_image.dec();
|
||||
timeline.metrics.layer_size_image.sub(self.desc.file_size);
|
||||
}
|
||||
}
|
||||
|
||||
if !*self.wanted_deleted.get_mut() {
|
||||
return;
|
||||
}
|
||||
@@ -793,6 +804,15 @@ impl LayerInner {
|
||||
(heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
|
||||
};
|
||||
|
||||
// This object acts as a RAII guard on these metrics: increment on construction
|
||||
if desc.is_delta() {
|
||||
timeline.metrics.layer_count_delta.inc();
|
||||
timeline.metrics.layer_size_delta.add(desc.file_size);
|
||||
} else {
|
||||
timeline.metrics.layer_count_image.inc();
|
||||
timeline.metrics.layer_size_image.add(desc.file_size);
|
||||
}
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
debug_str: {
|
||||
@@ -1471,14 +1491,22 @@ impl LayerInner {
|
||||
let duration = SystemTime::now().duration_since(local_layer_mtime);
|
||||
match duration {
|
||||
Ok(elapsed) => {
|
||||
timeline
|
||||
.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(elapsed);
|
||||
let accessed = self.access_stats.accessed();
|
||||
if accessed {
|
||||
// Only layers used for reads contribute to our "low residence" metric that is used
|
||||
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
|
||||
// to be rapidly evicted without contributing to this metric.
|
||||
timeline
|
||||
.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(elapsed);
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
residence_millis = elapsed.as_millis(),
|
||||
accessed,
|
||||
"evicted layer after known residence period"
|
||||
);
|
||||
}
|
||||
@@ -1687,7 +1715,6 @@ impl DownloadedLayer {
|
||||
lsn,
|
||||
summary,
|
||||
Some(owner.conf.max_vectored_read_bytes),
|
||||
owner.conf.image_compression.allow_decompression(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1892,7 +1919,7 @@ impl ResidentLayer {
|
||||
self.owner.metadata()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Cast the layer to a delta, return an error if it is an image layer.
|
||||
pub(crate) async fn get_as_delta(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
@@ -1904,7 +1931,7 @@ impl ResidentLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Cast the layer to an image, return an error if it is a delta layer.
|
||||
pub(crate) async fn get_as_image(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
|
||||
@@ -22,7 +22,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
|
||||
async fn smoke_test() {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
|
||||
let h = TenantHarness::create("smoke_test").unwrap();
|
||||
let h = TenantHarness::create("smoke_test").await.unwrap();
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
let (tenant, _) = h.load().await;
|
||||
@@ -176,7 +176,9 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
|
||||
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
|
||||
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted")
|
||||
.await
|
||||
.unwrap();
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
@@ -258,7 +260,9 @@ fn read_wins_pending_eviction() {
|
||||
rt.block_on(async move {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
|
||||
let h = TenantHarness::create("read_wins_pending_eviction")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
@@ -390,7 +394,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
rt.block_on(async move {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create(name).unwrap();
|
||||
let h = TenantHarness::create(name).await.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
@@ -559,8 +563,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h =
|
||||
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
|
||||
let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
@@ -636,7 +641,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn evict_and_wait_does_not_wait_for_download() {
|
||||
// let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
|
||||
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
@@ -733,7 +740,9 @@ async fn eviction_cancellation_on_drop() {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
|
||||
let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
|
||||
let h = TenantHarness::create("eviction_cancellation_on_drop")
|
||||
.await
|
||||
.unwrap();
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ pub struct PersistentLayerDesc {
|
||||
///
|
||||
/// - For an open in-memory layer, the end bound is MAX_LSN
|
||||
/// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
|
||||
/// range start
|
||||
/// range start
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
pub lsn_range: Range<Lsn>,
|
||||
/// Whether this is a delta layer, and also, is this incremental.
|
||||
|
||||
561
pageserver/src/tenant/storage_layer/merge_iterator.rs
Normal file
561
pageserver/src/tenant/storage_layer/merge_iterator.rs
Normal file
@@ -0,0 +1,561 @@
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
collections::{binary_heap, BinaryHeap},
|
||||
};
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{context::RequestContext, repository::Value};
|
||||
|
||||
use super::{
|
||||
delta_layer::{DeltaLayerInner, DeltaLayerIterator},
|
||||
image_layer::{ImageLayerInner, ImageLayerIterator},
|
||||
};
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
enum LayerRef<'a> {
|
||||
Image(&'a ImageLayerInner),
|
||||
Delta(&'a DeltaLayerInner),
|
||||
}
|
||||
|
||||
impl<'a> LayerRef<'a> {
|
||||
fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> {
|
||||
match self {
|
||||
Self::Image(x) => LayerIterRef::Image(x.iter(ctx)),
|
||||
Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum LayerIterRef<'a> {
|
||||
Image(ImageLayerIterator<'a>),
|
||||
Delta(DeltaLayerIterator<'a>),
|
||||
}
|
||||
|
||||
impl LayerIterRef<'_> {
|
||||
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
match self {
|
||||
Self::Delta(x) => x.next().await,
|
||||
Self::Image(x) => x.next().await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This type plays several roles at once
|
||||
/// 1. Unified iterator for image and delta layers.
|
||||
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
|
||||
/// 3. Lazy creation of the real delta/image iterator.
|
||||
enum IteratorWrapper<'a> {
|
||||
NotLoaded {
|
||||
ctx: &'a RequestContext,
|
||||
first_key_lower_bound: (Key, Lsn),
|
||||
layer: LayerRef<'a>,
|
||||
},
|
||||
Loaded {
|
||||
iter: PeekableLayerIterRef<'a>,
|
||||
},
|
||||
}
|
||||
|
||||
struct PeekableLayerIterRef<'a> {
|
||||
iter: LayerIterRef<'a>,
|
||||
peeked: Option<(Key, Lsn, Value)>, // None == end
|
||||
}
|
||||
|
||||
impl<'a> PeekableLayerIterRef<'a> {
|
||||
async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result<Self> {
|
||||
let peeked = iter.next().await?;
|
||||
Ok(Self { iter, peeked })
|
||||
}
|
||||
|
||||
fn peek(&self) -> &Option<(Key, Lsn, Value)> {
|
||||
&self.peeked
|
||||
}
|
||||
|
||||
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
let result = self.peeked.take();
|
||||
self.peeked = self.iter.next().await?;
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> std::cmp::Eq for IteratorWrapper<'a> {}
|
||||
|
||||
impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
use std::cmp::Ordering;
|
||||
let a = self.peek_next_key_lsn_value();
|
||||
let b = other.peek_next_key_lsn_value();
|
||||
match (a, b) {
|
||||
(Some((k1, l1, v1)), Some((k2, l2, v2))) => {
|
||||
fn map_value_to_num(val: &Option<&Value>) -> usize {
|
||||
match val {
|
||||
None => 0,
|
||||
Some(Value::Image(_)) => 1,
|
||||
Some(Value::WalRecord(_)) => 2,
|
||||
}
|
||||
}
|
||||
let order_1 = map_value_to_num(&v1);
|
||||
let order_2 = map_value_to_num(&v2);
|
||||
// When key_lsn are the same, the unloaded iter will always appear before the loaded one.
|
||||
// And note that we do a reverse at the end of the comparison, so it works with the max heap.
|
||||
(k1, l1, order_1).cmp(&(k2, l2, order_2))
|
||||
}
|
||||
(Some(_), None) => Ordering::Less,
|
||||
(None, Some(_)) => Ordering::Greater,
|
||||
(None, None) => Ordering::Equal,
|
||||
}
|
||||
.reverse()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> IteratorWrapper<'a> {
|
||||
pub fn create_from_image_layer(
|
||||
image_layer: &'a ImageLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
) -> Self {
|
||||
Self::NotLoaded {
|
||||
layer: LayerRef::Image(image_layer),
|
||||
first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
|
||||
ctx,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_from_delta_layer(
|
||||
delta_layer: &'a DeltaLayerInner,
|
||||
ctx: &'a RequestContext,
|
||||
) -> Self {
|
||||
Self::NotLoaded {
|
||||
layer: LayerRef::Delta(delta_layer),
|
||||
first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
|
||||
ctx,
|
||||
}
|
||||
}
|
||||
|
||||
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
|
||||
match self {
|
||||
Self::Loaded { iter } => iter
|
||||
.peek()
|
||||
.as_ref()
|
||||
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
|
||||
Self::NotLoaded {
|
||||
first_key_lower_bound: (key, lsn),
|
||||
..
|
||||
} => Some((key, *lsn, None)),
|
||||
}
|
||||
}
|
||||
|
||||
// CORRECTNESS: this function must always take `&mut self`, never `&self`.
|
||||
//
|
||||
// The reason is that `impl Ord for Self` evaluates differently after this function
|
||||
// returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when
|
||||
// the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut`
|
||||
// and not just `PeekMut::deref`
|
||||
// If we don't take `&mut self`
|
||||
async fn load(&mut self) -> anyhow::Result<()> {
|
||||
assert!(!self.is_loaded());
|
||||
let Self::NotLoaded {
|
||||
ctx,
|
||||
first_key_lower_bound,
|
||||
layer,
|
||||
} = self
|
||||
else {
|
||||
unreachable!()
|
||||
};
|
||||
let iter = layer.iter(ctx);
|
||||
let iter = PeekableLayerIterRef::create(iter).await?;
|
||||
if let Some((k1, l1, _)) = iter.peek() {
|
||||
let (k2, l2) = first_key_lower_bound;
|
||||
debug_assert!((k1, l1) >= (k2, l2));
|
||||
}
|
||||
*self = Self::Loaded { iter };
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_loaded(&self) -> bool {
|
||||
matches!(self, Self::Loaded { .. })
|
||||
}
|
||||
|
||||
/// Correctness: must load the iterator before using.
|
||||
///
|
||||
/// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it.
|
||||
/// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
|
||||
/// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
|
||||
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
let Self::Loaded { iter } = self else {
|
||||
panic!("must load the iterator before using")
|
||||
};
|
||||
iter.next().await
|
||||
}
|
||||
}
|
||||
|
||||
/// A merge iterator over delta/image layer iterators. When duplicated records are
|
||||
/// found, the iterator will not perform any deduplication, and the caller should handle
|
||||
/// these situation. By saying duplicated records, there are many possibilities:
|
||||
/// * Two same delta at the same LSN.
|
||||
/// * Two same image at the same LSN.
|
||||
/// * Delta/image at the same LSN where the image has already applied the delta.
|
||||
/// The iterator will always put the image before the delta.
|
||||
pub struct MergeIterator<'a> {
|
||||
heap: BinaryHeap<IteratorWrapper<'a>>,
|
||||
}
|
||||
|
||||
impl<'a> MergeIterator<'a> {
|
||||
pub fn create(
|
||||
deltas: &[&'a DeltaLayerInner],
|
||||
images: &[&'a ImageLayerInner],
|
||||
ctx: &'a RequestContext,
|
||||
) -> Self {
|
||||
let mut heap = Vec::with_capacity(images.len() + deltas.len());
|
||||
for image in images {
|
||||
heap.push(IteratorWrapper::create_from_image_layer(image, ctx));
|
||||
}
|
||||
for delta in deltas {
|
||||
heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx));
|
||||
}
|
||||
Self {
|
||||
heap: BinaryHeap::from(heap),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
while let Some(mut iter) = self.heap.peek_mut() {
|
||||
if !iter.is_loaded() {
|
||||
// Once we load the iterator, we can know the real first key-value pair in the iterator.
|
||||
// We put it back into the heap so that a potentially unloaded layer may have a key between
|
||||
// [potential_first_key, loaded_first_key).
|
||||
iter.load().await?;
|
||||
continue;
|
||||
}
|
||||
let Some(item) = iter.next().await? else {
|
||||
// If the iterator returns None, we pop this iterator. Actually, in the current implementation,
|
||||
// we order None > Some, and all the rest of the iterators should return None.
|
||||
binary_heap::PeekMut::pop(iter);
|
||||
continue;
|
||||
};
|
||||
return Ok(Some(item));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
|
||||
},
|
||||
walrecord::NeonWalRecord,
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
async fn assert_merge_iter_equal(
|
||||
merge_iter: &mut MergeIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
) {
|
||||
let mut expect_iter = expect.iter();
|
||||
loop {
|
||||
let o1 = merge_iter.next().await.unwrap();
|
||||
let o2 = expect_iter.next();
|
||||
assert_eq!(o1.is_some(), o2.is_some());
|
||||
if o1.is_none() && o2.is_none() {
|
||||
break;
|
||||
}
|
||||
let (k1, l1, v1) = o1.unwrap();
|
||||
let (k2, l2, v2) = o2.unwrap();
|
||||
assert_eq!(&k1, k2);
|
||||
assert_eq!(l1, *l2);
|
||||
assert_eq!(&v1, v2);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn merge_in_between() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_merge_in_between")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
let test_deltas1 = vec![
|
||||
(
|
||||
get_key(0),
|
||||
Lsn(0x10),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x10),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
),
|
||||
];
|
||||
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let test_deltas2 = vec![
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x10),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
),
|
||||
(
|
||||
get_key(4),
|
||||
Lsn(0x10),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
),
|
||||
];
|
||||
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
&[
|
||||
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
let mut expect = Vec::new();
|
||||
expect.extend(test_deltas1);
|
||||
expect.extend(test_deltas2);
|
||||
expect.sort_by(sort_delta);
|
||||
assert_merge_iter_equal(&mut merge_iter, &expect).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_merge")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
const N: usize = 1000;
|
||||
let test_deltas1 = (0..N)
|
||||
.map(|idx| {
|
||||
(
|
||||
get_key(idx as u32 / 10),
|
||||
Lsn(0x20 * ((idx as u64) % 10 + 1)),
|
||||
Value::Image(Bytes::from(format!("img{idx:05}"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let test_deltas2 = (0..N)
|
||||
.map(|idx| {
|
||||
(
|
||||
get_key(idx as u32 / 10),
|
||||
Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10),
|
||||
Value::Image(Bytes::from(format!("img{idx:05}"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let test_deltas3 = (0..N)
|
||||
.map(|idx| {
|
||||
(
|
||||
get_key(idx as u32 / 10 + N as u32),
|
||||
Lsn(0x10 * ((idx as u64) % 10 + 1)),
|
||||
Value::Image(Bytes::from(format!("img{idx:05}"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
&[
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
|
||||
],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
let mut expect = Vec::new();
|
||||
expect.extend(test_deltas1);
|
||||
expect.extend(test_deltas2);
|
||||
expect.extend(test_deltas3);
|
||||
expect.sort_by(sort_delta);
|
||||
assert_merge_iter_equal(&mut merge_iter, &expect).await;
|
||||
|
||||
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_image_mixed_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
// In this test case, we want to test if the iterator still works correctly with multiple copies
|
||||
// of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
|
||||
// Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
|
||||
// An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
|
||||
// could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
|
||||
// one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
|
||||
// correctly process these situations and return everything as-is, and the upper layer of the system
|
||||
// will handle duplicated LSNs.
|
||||
let test_deltas1 = vec![
|
||||
(
|
||||
get_key(0),
|
||||
Lsn(0x10),
|
||||
Value::WalRecord(NeonWalRecord::wal_init()),
|
||||
),
|
||||
(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("a")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x10),
|
||||
Value::WalRecord(NeonWalRecord::wal_init()),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x18),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("b")),
|
||||
),
|
||||
];
|
||||
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut test_deltas2 = test_deltas1.clone();
|
||||
test_deltas2.push((
|
||||
get_key(10),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
));
|
||||
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let test_deltas3 = vec![
|
||||
(
|
||||
get_key(0),
|
||||
Lsn(0x10),
|
||||
Value::Image(Bytes::copy_from_slice(b"")),
|
||||
),
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x18),
|
||||
Value::Image(Bytes::copy_from_slice(b"b")),
|
||||
),
|
||||
(
|
||||
get_key(15),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
),
|
||||
];
|
||||
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut test_deltas4 = test_deltas3.clone();
|
||||
test_deltas4.push((
|
||||
get_key(20),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::copy_from_slice(b"test")),
|
||||
));
|
||||
let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut expect = Vec::new();
|
||||
expect.extend(test_deltas1);
|
||||
expect.extend(test_deltas2);
|
||||
expect.extend(test_deltas3);
|
||||
expect.extend(test_deltas4);
|
||||
expect.sort_by(sort_delta_value);
|
||||
|
||||
// Test with different layer order for MergeIterator::create to ensure the order
|
||||
// is stable.
|
||||
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
&[
|
||||
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
|
||||
],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
assert_merge_iter_equal(&mut merge_iter, &expect).await;
|
||||
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
&[
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
|
||||
],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
assert_merge_iter_equal(&mut merge_iter, &expect).await;
|
||||
|
||||
is_send(merge_iter);
|
||||
}
|
||||
|
||||
fn is_send(_: impl Send) {}
|
||||
}
|
||||
@@ -30,7 +30,7 @@ use pageserver_api::{
|
||||
InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
|
||||
},
|
||||
reltag::BlockNumber,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId},
|
||||
shard::{ShardIdentity, ShardNumber, TenantShardId},
|
||||
};
|
||||
use rand::Rng;
|
||||
use serde_with::serde_as;
|
||||
@@ -66,12 +66,13 @@ use std::{
|
||||
ops::{Deref, Range},
|
||||
};
|
||||
|
||||
use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
config::defaults::DEFAULT_PITR_INTERVAL,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::PersistentLayerDesc,
|
||||
},
|
||||
};
|
||||
use crate::{
|
||||
@@ -98,6 +99,7 @@ use crate::{
|
||||
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
|
||||
};
|
||||
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
|
||||
use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
|
||||
use crate::{
|
||||
pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
@@ -135,7 +137,7 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::{config::TenantConf, storage_layer::LayerVisibility};
|
||||
use super::config::TenantConf;
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
||||
@@ -196,7 +198,7 @@ impl PartialOrd for Hole {
|
||||
|
||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||
/// Can be removed after all refactors are done.
|
||||
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
|
||||
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
||||
drop(rlock)
|
||||
}
|
||||
|
||||
@@ -269,7 +271,7 @@ pub struct Timeline {
|
||||
///
|
||||
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
|
||||
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
|
||||
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
|
||||
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
|
||||
|
||||
last_freeze_at: AtomicLsn,
|
||||
// Atomic would be more appropriate here.
|
||||
@@ -453,12 +455,12 @@ pub struct WalReceiverInfo {
|
||||
/// Garbage Collection.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GcInfo {
|
||||
/// Record which parts of this timeline's history are still needed by children
|
||||
/// Specific LSNs that are needed.
|
||||
///
|
||||
/// Optionally store each child's keyspace at their branch LSN: parts of the keyspace not covered here may be dropped during GC, as
|
||||
/// the child will never read them. For example, a child which has covered its whole keyspace with image layers
|
||||
/// will put an empty keyspace here. Children populate this: if it is None, presume the child may read any part of the keyspace.
|
||||
pub(crate) retain_lsns: Vec<(Lsn, TimelineId, Option<KeySpace>)>,
|
||||
/// Currently, this includes all points where child branches have
|
||||
/// been forked off from. In the future, could also include
|
||||
/// explicit user-defined snapshot points.
|
||||
pub(crate) retain_lsns: Vec<Lsn>,
|
||||
|
||||
/// The cutoff coordinates, which are combined by selecting the minimum.
|
||||
pub(crate) cutoffs: GcCutoffs,
|
||||
@@ -474,56 +476,34 @@ impl GcInfo {
|
||||
pub(crate) fn min_cutoff(&self) -> Lsn {
|
||||
self.cutoffs.select_min()
|
||||
}
|
||||
|
||||
pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
|
||||
self.retain_lsns.push((child_lsn, child_id, None));
|
||||
self.retain_lsns.sort_by_key(|i| i.0);
|
||||
}
|
||||
|
||||
pub(super) fn remove_child(&mut self, child_id: TimelineId) {
|
||||
self.retain_lsns.retain(|i| i.1 != child_id);
|
||||
}
|
||||
|
||||
/// When the child re-calculates which parts of the keyspace it will read from the ancestor, it posts
|
||||
/// and update to the parent using this function, to enable the parent to perhaps GC more layers.
|
||||
pub(super) fn notify_child_keyspace(&mut self, child_id: TimelineId, key_space: KeySpace) {
|
||||
if let Ok(idx) = self.retain_lsns.binary_search_by_key(&child_id, |i| i.1) {
|
||||
self.retain_lsns.get_mut(idx).unwrap().2 = Some(key_space);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The `GcInfo` component describing which Lsns need to be retained.
|
||||
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
|
||||
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
|
||||
/// between time-based and space-based retention for observability and consumption metrics purposes.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct GcCutoffs {
|
||||
/// Keep everything newer than this point.
|
||||
///
|
||||
/// This is calculated by subtracting 'gc_horizon' setting from
|
||||
/// last-record LSN
|
||||
///
|
||||
/// FIXME: is this inclusive or exclusive?
|
||||
pub(crate) horizon: Lsn,
|
||||
/// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
|
||||
/// history we must keep to retain a specified number of bytes of WAL.
|
||||
pub(crate) space: Lsn,
|
||||
|
||||
/// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
|
||||
/// point.
|
||||
///
|
||||
/// This is calculated by finding a number such that a record is needed for PITR
|
||||
/// if only if its LSN is larger than 'pitr_cutoff'.
|
||||
pub(crate) pitr: Lsn,
|
||||
/// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much
|
||||
/// history we must keep to enable reading back at least the PITR interval duration.
|
||||
pub(crate) time: Lsn,
|
||||
}
|
||||
|
||||
impl Default for GcCutoffs {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
horizon: Lsn::INVALID,
|
||||
pitr: Lsn::INVALID,
|
||||
space: Lsn::INVALID,
|
||||
time: Lsn::INVALID,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GcCutoffs {
|
||||
fn select_min(&self) -> Lsn {
|
||||
std::cmp::min(self.horizon, self.pitr)
|
||||
std::cmp::min(self.space, self.time)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -745,6 +725,9 @@ impl From<CreateImageLayersError> for CompactionError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
|
||||
CreateImageLayersError::Other(e) => {
|
||||
CompactionError::Other(e.context("create image layers"))
|
||||
}
|
||||
_ => CompactionError::Other(e.into()),
|
||||
}
|
||||
}
|
||||
@@ -879,7 +862,7 @@ impl Timeline {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let history = self
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(gc_info.cutoffs.pitr)
|
||||
.checked_sub(gc_info.cutoffs.time)
|
||||
.unwrap_or(Lsn(0))
|
||||
.0;
|
||||
(history, gc_info.within_ancestor_pitr)
|
||||
@@ -1578,7 +1561,7 @@ impl Timeline {
|
||||
) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
|
||||
"LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)",
|
||||
lsn,
|
||||
**latest_gc_cutoff_lsn,
|
||||
);
|
||||
@@ -1810,26 +1793,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
match self.get_compaction_algorithm_settings().kind {
|
||||
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await?,
|
||||
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await?,
|
||||
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
|
||||
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
|
||||
}
|
||||
|
||||
if self.shard_identity.count >= ShardCount::new(2) {
|
||||
// Limit the number of layer rewrites to the number of partitions: this means its
|
||||
// runtime should be comparable to a full round of image layer creations, rather than
|
||||
// being potentially much longer.
|
||||
// TODO: make `partitioning` a sync lock: see comment in `repartition()` for why there's no
|
||||
// real async use.
|
||||
let rewrite_max = self.partitioning.try_lock().unwrap().0 .0.parts.len();
|
||||
|
||||
self.compact_shard_ancestors(rewrite_max, ctx).await?;
|
||||
}
|
||||
|
||||
// TODO: be more selective: call this once at startup, and thereafter only when some branching changes or
|
||||
// when image layer are generated.
|
||||
self.update_layer_visibility(ctx).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
@@ -3001,17 +2967,6 @@ impl Timeline {
|
||||
.set((calculated_size, metrics_guard.calculation_result_saved()))
|
||||
.ok()
|
||||
.expect("only this task sets it");
|
||||
|
||||
// As a nice-to-have, calculate layer visibilties. Otherwise this will
|
||||
// be initialized on first compaction. Doing it as early as possible
|
||||
// enables code that depends on layer visibility (like uploading heatmaps)
|
||||
// to execute earlier, rather than waiting for compaction.
|
||||
match self.update_layer_visibility(&background_ctx).await {
|
||||
Ok(_) | Err(CompactionError::ShuttingDown) => {}
|
||||
Err(e) => {
|
||||
tracing::warn!("Initial layer visibility calculation failed: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn spawn_ondemand_logical_size_calculation(
|
||||
@@ -3189,8 +3144,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// The timeline heatmap is a hint to secondary locations from the primary location,
|
||||
/// indicating which layers should be downloaded on the secondary to give it a warm
|
||||
/// cache, that will enable it to take over as the attached location without degrading performance.
|
||||
/// indicating which layers are currently on-disk on the primary.
|
||||
///
|
||||
/// None is returned if the Timeline is in a state where uploading a heatmap
|
||||
/// doesn't make sense, such as shutting down or initializing. The caller
|
||||
@@ -3203,32 +3157,19 @@ impl Timeline {
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
let mut resident_visible_layers = Vec::new();
|
||||
let now = SystemTime::now();
|
||||
for layer in guard.likely_resident_layers() {
|
||||
let (atime, visibility) = layer.access_stats().atime_visibility();
|
||||
let resident = guard.likely_resident_layers().map(|layer| {
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
|
||||
match visibility {
|
||||
LayerVisibility::Uninitialized => {
|
||||
// Refuse to generate a heatmap at all until layer visibilty is initialized
|
||||
return None;
|
||||
}
|
||||
LayerVisibility::Covered => {
|
||||
// This layer is covered: exclude it from the heatmap because a secondary
|
||||
// node is highly unlikely to need this layer in the event that it takes over as attached
|
||||
}
|
||||
LayerVisibility::Visible => resident_visible_layers.push(HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
layer.metadata(),
|
||||
atime.unwrap_or(now),
|
||||
)),
|
||||
}
|
||||
}
|
||||
HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
)
|
||||
});
|
||||
|
||||
Some(HeatMapTimeline::new(
|
||||
self.timeline_id,
|
||||
resident_visible_layers,
|
||||
))
|
||||
let layers = resident.collect();
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
}
|
||||
|
||||
/// Returns true if the given lsn is or was an ancestor branchpoint.
|
||||
@@ -3463,6 +3404,8 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
|
||||
#[allow(clippy::doc_lazy_continuation)]
|
||||
/// Get the data needed to reconstruct all keys in the provided keyspace
|
||||
///
|
||||
/// The algorithm is as follows:
|
||||
@@ -4529,10 +4472,10 @@ impl Timeline {
|
||||
/// are required. Since checking if new image layers are required is expensive in
|
||||
/// terms of CPU, we only do it in the following cases:
|
||||
/// 1. If the timeline has ingested sufficient WAL to justify the cost
|
||||
/// 2. If enough time has passed since the last check
|
||||
/// 2.1. For large tenants, we wish to perform the check more often since they
|
||||
/// suffer from the lack of image layers
|
||||
/// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval
|
||||
/// 2. If enough time has passed since the last check:
|
||||
/// 1. For large tenants, we wish to perform the check more often since they
|
||||
/// suffer from the lack of image layers
|
||||
/// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval
|
||||
fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
|
||||
const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
|
||||
|
||||
@@ -4628,6 +4571,22 @@ impl Timeline {
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
} else if let ImageLayerCreationMode::Force = mode {
|
||||
// When forced to create image layers, we might try and create them where they already
|
||||
// exist. This mode is only used in tests/debug.
|
||||
let layers = self.layers.read().await;
|
||||
if layers.contains_key(&PersistentLayerKey {
|
||||
key_range: img_range.clone(),
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
|
||||
is_delta: false,
|
||||
}) {
|
||||
tracing::info!(
|
||||
"Skipping image layer at {lsn} {}..{}, already exists",
|
||||
img_range.start,
|
||||
img_range.end
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let image_layer_writer = ImageLayerWriter::new(
|
||||
@@ -4758,7 +4717,7 @@ impl Timeline {
|
||||
/// Requires a timeline that:
|
||||
/// - has an ancestor to detach from
|
||||
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
|
||||
/// a technical requirement
|
||||
/// a technical requirement
|
||||
///
|
||||
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
|
||||
/// polled again until completion.
|
||||
@@ -4770,13 +4729,7 @@ impl Timeline {
|
||||
tenant: &crate::tenant::Tenant,
|
||||
options: detach_ancestor::Options,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<
|
||||
(
|
||||
completion::Completion,
|
||||
detach_ancestor::PreparedTimelineDetach,
|
||||
),
|
||||
detach_ancestor::Error,
|
||||
> {
|
||||
) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
|
||||
detach_ancestor::prepare(self, tenant, options, ctx).await
|
||||
}
|
||||
|
||||
@@ -4983,24 +4936,21 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Find the Lsns above which layer files need to be retained on
|
||||
/// garbage collection. This is separate from actually performing the GC,
|
||||
/// and is updated more frequently, so that compaction can remove obsolete
|
||||
/// page versions more aggressively.
|
||||
/// garbage collection.
|
||||
///
|
||||
/// TODO: that's wishful thinking, compaction doesn't actually do that
|
||||
/// currently.
|
||||
/// We calculate two cutoffs, one based on time and one based on WAL size. `pitr`
|
||||
/// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
|
||||
/// the space-based retention.
|
||||
///
|
||||
/// The 'cutoff_horizon' point is used to retain recent versions that might still be
|
||||
/// needed by read-only nodes. (As of this writing, the caller just passes
|
||||
/// the latest LSN subtracted by a constant, and doesn't do anything smart
|
||||
/// to figure out what read-only nodes might actually need.)
|
||||
///
|
||||
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
|
||||
/// whether a record is needed for PITR.
|
||||
/// This function doesn't simply to calculate time & space based retention: it treats time-based
|
||||
/// retention as authoritative if enabled, and falls back to space-based retention if calculating
|
||||
/// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might
|
||||
/// be different to the `space_cutoff` input. Callers should treat the min() of the two cutoffs
|
||||
/// in the response as the GC cutoff point for the timeline.
|
||||
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
|
||||
pub(super) async fn find_gc_cutoffs(
|
||||
&self,
|
||||
cutoff_horizon: Lsn,
|
||||
space_cutoff: Lsn,
|
||||
pitr: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
@@ -5013,58 +4963,87 @@ impl Timeline {
|
||||
|
||||
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
|
||||
|
||||
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
|
||||
//
|
||||
// Some unit tests depend on garbage-collection working even when
|
||||
// CLOG data is missing, so that find_lsn_for_timestamp() doesn't
|
||||
// work, so avoid calling it altogether if time-based retention is not
|
||||
// configured. It would be pointless anyway.
|
||||
let pitr_cutoff = if pitr != Duration::ZERO {
|
||||
let now = SystemTime::now();
|
||||
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
|
||||
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
|
||||
|
||||
match self
|
||||
.find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
|
||||
.await?
|
||||
{
|
||||
LsnForTimestamp::Present(lsn) => lsn,
|
||||
LsnForTimestamp::Future(lsn) => {
|
||||
// The timestamp is in the future. That sounds impossible,
|
||||
// but what it really means is that there hasn't been
|
||||
// any commits since the cutoff timestamp.
|
||||
//
|
||||
// In this case we should use the LSN of the most recent commit,
|
||||
// which is implicitly the last LSN in the log.
|
||||
debug!("future({})", lsn);
|
||||
self.get_last_record_lsn()
|
||||
}
|
||||
LsnForTimestamp::Past(lsn) => {
|
||||
debug!("past({})", lsn);
|
||||
// conservative, safe default is to remove nothing, when we
|
||||
// have no commit timestamp data available
|
||||
*self.get_latest_gc_cutoff_lsn()
|
||||
}
|
||||
LsnForTimestamp::NoData(lsn) => {
|
||||
debug!("nodata({})", lsn);
|
||||
// conservative, safe default is to remove nothing, when we
|
||||
// have no commit timestamp data available
|
||||
*self.get_latest_gc_cutoff_lsn()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If we don't have enough data to convert to LSN,
|
||||
// play safe and don't remove any layers.
|
||||
*self.get_latest_gc_cutoff_lsn()
|
||||
if cfg!(test) {
|
||||
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
|
||||
if pitr == Duration::ZERO {
|
||||
return Ok(GcCutoffs {
|
||||
time: self.get_last_record_lsn(),
|
||||
space: space_cutoff,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate a time-based limit on how much to retain:
|
||||
// - if PITR interval is set, then this is our cutoff.
|
||||
// - if PITR interval is not set, then we do a lookup
|
||||
// based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
|
||||
let time_cutoff = {
|
||||
let now = SystemTime::now();
|
||||
let time_range = if pitr == Duration::ZERO {
|
||||
humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
|
||||
} else {
|
||||
pitr
|
||||
};
|
||||
|
||||
// If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
|
||||
let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
|
||||
let timestamp = to_pg_timestamp(time_cutoff);
|
||||
|
||||
match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
|
||||
LsnForTimestamp::Present(lsn) => Some(lsn),
|
||||
LsnForTimestamp::Future(lsn) => {
|
||||
// The timestamp is in the future. That sounds impossible,
|
||||
// but what it really means is that there hasn't been
|
||||
// any commits since the cutoff timestamp.
|
||||
//
|
||||
// In this case we should use the LSN of the most recent commit,
|
||||
// which is implicitly the last LSN in the log.
|
||||
debug!("future({})", lsn);
|
||||
Some(self.get_last_record_lsn())
|
||||
}
|
||||
LsnForTimestamp::Past(lsn) => {
|
||||
debug!("past({})", lsn);
|
||||
None
|
||||
}
|
||||
LsnForTimestamp::NoData(lsn) => {
|
||||
debug!("nodata({})", lsn);
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No time-based retention was configured. Interpret this as "keep no history".
|
||||
self.get_last_record_lsn()
|
||||
};
|
||||
|
||||
Ok(GcCutoffs {
|
||||
horizon: cutoff_horizon,
|
||||
pitr: pitr_cutoff,
|
||||
Ok(match (pitr, time_cutoff) {
|
||||
(Duration::ZERO, Some(time_cutoff)) => {
|
||||
// PITR is not set. Retain the size-based limit, or the default time retention,
|
||||
// whichever requires less data.
|
||||
GcCutoffs {
|
||||
time: self.get_last_record_lsn(),
|
||||
space: std::cmp::max(time_cutoff, space_cutoff),
|
||||
}
|
||||
}
|
||||
(Duration::ZERO, None) => {
|
||||
// PITR is not set, and time lookup failed
|
||||
GcCutoffs {
|
||||
time: self.get_last_record_lsn(),
|
||||
space: space_cutoff,
|
||||
}
|
||||
}
|
||||
(_, None) => {
|
||||
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
|
||||
// cannot advance beyond what was already GC'd, and respect space-based retention
|
||||
GcCutoffs {
|
||||
time: *self.get_latest_gc_cutoff_lsn(),
|
||||
space: space_cutoff,
|
||||
}
|
||||
}
|
||||
(_, Some(time_cutoff)) => {
|
||||
// PITR interval is set and we looked up timestamp successfully. Ignore
|
||||
// size based retention and make time cutoff authoritative
|
||||
GcCutoffs {
|
||||
time: time_cutoff,
|
||||
space: time_cutoff,
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -5089,16 +5068,12 @@ impl Timeline {
|
||||
return Err(GcError::TimelineCancelled);
|
||||
}
|
||||
|
||||
let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
|
||||
let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
|
||||
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
|
||||
let pitr_cutoff = gc_info.cutoffs.pitr;
|
||||
let retain_lsns = gc_info
|
||||
.retain_lsns
|
||||
.iter()
|
||||
.map(|(lsn, _child_id, _)| *lsn)
|
||||
.collect();
|
||||
let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
|
||||
let time_cutoff = gc_info.cutoffs.time;
|
||||
let retain_lsns = gc_info.retain_lsns.clone();
|
||||
|
||||
// Gets the maximum LSN that holds the valid lease.
|
||||
//
|
||||
@@ -5107,14 +5082,14 @@ impl Timeline {
|
||||
let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
|
||||
|
||||
(
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
space_cutoff,
|
||||
time_cutoff,
|
||||
retain_lsns,
|
||||
max_lsn_with_valid_lease,
|
||||
)
|
||||
};
|
||||
|
||||
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
|
||||
let standby_horizon = self.standby_horizon.load();
|
||||
// Hold GC for the standby, but as a safety guard do it only within some
|
||||
// reasonable lag.
|
||||
@@ -5143,8 +5118,8 @@ impl Timeline {
|
||||
|
||||
let res = self
|
||||
.gc_timeline(
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
space_cutoff,
|
||||
time_cutoff,
|
||||
retain_lsns,
|
||||
max_lsn_with_valid_lease,
|
||||
new_gc_cutoff,
|
||||
@@ -5162,8 +5137,8 @@ impl Timeline {
|
||||
|
||||
async fn gc_timeline(
|
||||
&self,
|
||||
horizon_cutoff: Lsn,
|
||||
pitr_cutoff: Lsn,
|
||||
space_cutoff: Lsn,
|
||||
time_cutoff: Lsn,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
max_lsn_with_valid_lease: Option<Lsn>,
|
||||
new_gc_cutoff: Lsn,
|
||||
@@ -5224,22 +5199,22 @@ impl Timeline {
|
||||
result.layers_total += 1;
|
||||
|
||||
// 1. Is it newer than GC horizon cutoff point?
|
||||
if l.get_lsn_range().end > horizon_cutoff {
|
||||
if l.get_lsn_range().end > space_cutoff {
|
||||
debug!(
|
||||
"keeping {} because it's newer than horizon_cutoff {}",
|
||||
"keeping {} because it's newer than space_cutoff {}",
|
||||
l.layer_name(),
|
||||
horizon_cutoff,
|
||||
space_cutoff,
|
||||
);
|
||||
result.layers_needed_by_cutoff += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
// 2. It is newer than PiTR cutoff point?
|
||||
if l.get_lsn_range().end > pitr_cutoff {
|
||||
if l.get_lsn_range().end > time_cutoff {
|
||||
debug!(
|
||||
"keeping {} because it's newer than pitr_cutoff {}",
|
||||
"keeping {} because it's newer than time_cutoff {}",
|
||||
l.layer_name(),
|
||||
pitr_cutoff,
|
||||
time_cutoff,
|
||||
);
|
||||
result.layers_needed_by_pitr += 1;
|
||||
continue 'outer;
|
||||
@@ -6071,8 +6046,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn two_layer_eviction_attempts_at_the_same_time() {
|
||||
let harness =
|
||||
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
|
||||
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
|
||||
@@ -19,16 +19,18 @@ use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::ShardedRange;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, LayerVisibility, PersistentLayerDesc};
|
||||
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
|
||||
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::DeltaLayer;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
@@ -100,7 +102,7 @@ impl Timeline {
|
||||
// Define partitioning schema if needed
|
||||
|
||||
// FIXME: the match should only cover repartitioning, not the next steps
|
||||
match self
|
||||
let partition_count = match self
|
||||
.repartition(
|
||||
self.get_last_record_lsn(),
|
||||
self.get_compaction_target_size(),
|
||||
@@ -140,6 +142,7 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
partitioning.parts.len()
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -151,9 +154,19 @@ impl Timeline {
|
||||
if !self.cancel.is_cancelled() {
|
||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
}
|
||||
1
|
||||
}
|
||||
};
|
||||
|
||||
if self.shard_identity.count >= ShardCount::new(2) {
|
||||
// Limit the number of layer rewrites to the number of partitions: this means its
|
||||
// runtime should be comparable to a full round of image layer creations, rather than
|
||||
// being potentially much longer.
|
||||
let rewrite_max = partition_count;
|
||||
|
||||
self.compact_shard_ancestors(rewrite_max, ctx).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -165,7 +178,7 @@ impl Timeline {
|
||||
///
|
||||
/// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound
|
||||
/// how much work it will try to do in each compaction pass.
|
||||
pub(super) async fn compact_shard_ancestors(
|
||||
async fn compact_shard_ancestors(
|
||||
self: &Arc<Self>,
|
||||
rewrite_max: usize,
|
||||
ctx: &RequestContext,
|
||||
@@ -184,7 +197,7 @@ impl Timeline {
|
||||
tracing::info!(
|
||||
"latest_gc_cutoff: {}, pitr cutoff {}",
|
||||
*latest_gc_cutoff,
|
||||
self.gc_info.read().unwrap().cutoffs.pitr
|
||||
self.gc_info.read().unwrap().cutoffs.time
|
||||
);
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
@@ -347,88 +360,6 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A post-compaction step to update the LayerVisibility of layers covered by image layers. This
|
||||
/// should also be called when new branches are created.
|
||||
///
|
||||
/// Sweep through the layer map, identifying layers which are covered by image layers
|
||||
/// such that they do not need to be available to service reads. The resulting LayerVisibility
|
||||
/// result may be used as an input to eviction and secondary downloads to de-prioritize layers
|
||||
/// that we know won't be needed for reads.
|
||||
pub(super) async fn update_layer_visibility(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
// Start with a keyspace representing all the keys we need to read from the tip of the branch
|
||||
let head_lsn = self.get_last_record_lsn();
|
||||
let (mut head_keyspace, sparse_ks) = self.collect_keyspace(head_lsn, ctx).await?;
|
||||
|
||||
// Converting the sparse part of the keyspace into the dense keyspace is safe in this context
|
||||
// because we will never iterate through the keys.
|
||||
head_keyspace.merge(&sparse_ks.0);
|
||||
|
||||
// We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
|
||||
// are implicitly visible, because LayerVisibility's default is Visible, and we never modify it here.
|
||||
let layer_manager = self.layers.read().await;
|
||||
let layer_map = layer_manager.layer_map();
|
||||
|
||||
let mut visible_size: u64 = 0;
|
||||
|
||||
// FIXME: we only get accurate keyspaces from children if they've already run update_layer_visibility themselves. At startup all the timelines
|
||||
// initialize this in arbitrary order (at the end of initial_logical_size_calculation). We should coordinate these. Perhaps at the very start
|
||||
// of the tenant compaction task we should do all the timelines' layer visibility calculations in a leaf-first order?
|
||||
let readable_points = {
|
||||
let children = self.gc_info.read().unwrap().retain_lsns.clone();
|
||||
|
||||
let mut readable_points = Vec::with_capacity(children.len() + 1);
|
||||
for (child_lsn, _child_timeline_id, child_keyspace) in &children {
|
||||
let keyspace = match child_keyspace {
|
||||
Some(ks) => ks.clone(),
|
||||
None => {
|
||||
// The child has not posted information about which parts of the keyspace they depend on: presume they depend on all of it.
|
||||
let (mut keyspace, sparse_keyspace) =
|
||||
self.collect_keyspace(*child_lsn, ctx).await?;
|
||||
keyspace.merge(&sparse_keyspace.0);
|
||||
keyspace
|
||||
}
|
||||
};
|
||||
readable_points.push((*child_lsn, keyspace));
|
||||
}
|
||||
readable_points.push((head_lsn, head_keyspace));
|
||||
readable_points
|
||||
};
|
||||
|
||||
let (layer_visibility, shadow) = layer_map.get_visibility(readable_points);
|
||||
for (layer_desc, visibility) in layer_visibility {
|
||||
// FIXME: a more efficiency bulk zip() through the layers rather than NlogN getting each one
|
||||
let layer = layer_manager.get_from_desc(&layer_desc);
|
||||
if matches!(visibility, LayerVisibility::Visible) {
|
||||
visible_size += layer.metadata().file_size;
|
||||
}
|
||||
|
||||
layer.access_stats().set_visibility(visibility);
|
||||
}
|
||||
|
||||
if let Some(ancestor) = &self.ancestor_timeline {
|
||||
// Having calculated the readable keyspace after walking back through all this timeline's layers, the resulting keyspace is the remaining
|
||||
// keys for which reads may still fall through to the parent branch. Notify the parent branch of this, so that they may GC layers which
|
||||
// do not overlap with this keyspace, and so that they may use this as an input to their own visibility updates.
|
||||
ancestor
|
||||
.gc_info
|
||||
.write()
|
||||
.unwrap()
|
||||
.notify_child_keyspace(self.timeline_id, shadow);
|
||||
}
|
||||
|
||||
// Also include in the visible size all the layers which we would never update visibility on
|
||||
// TODO: getter that doesn't spuriously construct a Vec<>
|
||||
for layer in layer_map.get_level0_deltas().unwrap() {
|
||||
visible_size += layer.file_size;
|
||||
}
|
||||
self.metrics.visible_physical_size_gauge.set(visible_size);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
|
||||
/// as Level 1 files.
|
||||
async fn compact_level0(
|
||||
@@ -450,7 +381,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let begin = tokio::time::Instant::now();
|
||||
let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
|
||||
let phase1_layers_locked = self.layers.read().await;
|
||||
let now = tokio::time::Instant::now();
|
||||
stats.read_lock_acquisition_micros =
|
||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||
@@ -470,9 +401,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
|
||||
async fn compact_level0_phase1(
|
||||
self: &Arc<Self>,
|
||||
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
|
||||
async fn compact_level0_phase1<'a>(
|
||||
self: &'a Arc<Self>,
|
||||
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
@@ -486,6 +417,7 @@ impl Timeline {
|
||||
.map(|x| guard.get_from_desc(&x))
|
||||
.collect_vec();
|
||||
stats.level0_deltas_count = Some(level0_deltas.len());
|
||||
|
||||
// Only compact if enough layers have accumulated.
|
||||
let threshold = self.get_compaction_threshold();
|
||||
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
|
||||
@@ -516,6 +448,22 @@ impl Timeline {
|
||||
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
|
||||
let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
|
||||
|
||||
// Accumulate the size of layers in `deltas_to_compact`
|
||||
let mut deltas_to_compact_bytes = 0;
|
||||
|
||||
// Under normal circumstances, we will accumulate up to compaction_interval L0s of size
|
||||
// checkpoint_distance each. To avoid edge cases using extra system resources, bound our
|
||||
// work in this function to only operate on this much delta data at once.
|
||||
//
|
||||
// Take the max of the configured value & the default, so that tests that configure tiny values
|
||||
// can still use a sensible amount of memory, but if a deployed system configures bigger values we
|
||||
// still let them compact a full stack of L0s in one go.
|
||||
let delta_size_limit = std::cmp::max(
|
||||
self.get_compaction_threshold(),
|
||||
DEFAULT_COMPACTION_THRESHOLD,
|
||||
) as u64
|
||||
* std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
|
||||
|
||||
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
|
||||
for l in level0_deltas_iter {
|
||||
let lsn_range = &l.layer_desc().lsn_range;
|
||||
@@ -524,7 +472,20 @@ impl Timeline {
|
||||
break;
|
||||
}
|
||||
deltas_to_compact.push(l.download_and_keep_resident().await?);
|
||||
deltas_to_compact_bytes += l.metadata().file_size;
|
||||
prev_lsn_end = lsn_range.end;
|
||||
|
||||
if deltas_to_compact_bytes >= delta_size_limit {
|
||||
info!(
|
||||
l0_deltas_selected = deltas_to_compact.len(),
|
||||
l0_deltas_total = level0_deltas.len(),
|
||||
"L0 compaction picker hit max delta layer size limit: {}",
|
||||
delta_size_limit
|
||||
);
|
||||
|
||||
// Proceed with compaction, but only a subset of L0s
|
||||
break;
|
||||
}
|
||||
}
|
||||
let lsn_range = Range {
|
||||
start: deltas_to_compact
|
||||
@@ -1061,7 +1022,7 @@ impl Timeline {
|
||||
"enhanced legacy compaction currently does not support retain_lsns (branches)"
|
||||
)));
|
||||
}
|
||||
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
|
||||
let gc_cutoff = gc_info.cutoffs.select_min();
|
||||
let mut selected_layers = Vec::new();
|
||||
// TODO: consider retain_lsns
|
||||
drop(gc_info);
|
||||
@@ -1079,10 +1040,12 @@ impl Timeline {
|
||||
);
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, collect the layer information to decide when to split the new delta layers.
|
||||
let mut all_key_values = Vec::new();
|
||||
let mut downloaded_layers = Vec::new();
|
||||
let mut delta_split_points = BTreeSet::new();
|
||||
for layer in &layer_selection {
|
||||
all_key_values.extend(layer.load_key_values(ctx).await?);
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
|
||||
let desc = layer.layer_desc();
|
||||
if desc.is_delta() {
|
||||
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
|
||||
@@ -1092,44 +1055,28 @@ impl Timeline {
|
||||
delta_split_points.insert(key_range.end);
|
||||
}
|
||||
}
|
||||
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
|
||||
// image layers, make image appear before than delta.
|
||||
struct ValueWrapper<'a>(&'a crate::repository::Value);
|
||||
impl Ord for ValueWrapper<'_> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
use crate::repository::Value;
|
||||
use std::cmp::Ordering;
|
||||
match (self.0, other.0) {
|
||||
(Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
|
||||
(Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
|
||||
_ => Ordering::Equal,
|
||||
}
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
for resident_layer in &downloaded_layers {
|
||||
if resident_layer.layer_desc().is_delta() {
|
||||
let layer = resident_layer.get_as_delta(ctx).await?;
|
||||
delta_layers.push(layer);
|
||||
} else {
|
||||
let layer = resident_layer.get_as_image(ctx).await?;
|
||||
image_layers.push(layer);
|
||||
}
|
||||
}
|
||||
impl PartialOrd for ValueWrapper<'_> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl PartialEq for ValueWrapper<'_> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == std::cmp::Ordering::Equal
|
||||
}
|
||||
}
|
||||
impl Eq for ValueWrapper<'_> {}
|
||||
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
|
||||
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
|
||||
});
|
||||
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
|
||||
let mut last_key: Option<Key> = None;
|
||||
|
||||
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
|
||||
async fn flush_accumulated_states(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
|
||||
accumulated_values: &[(Key, Lsn, crate::repository::Value)],
|
||||
horizon: Lsn,
|
||||
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
|
||||
let mut base_image = None;
|
||||
@@ -1230,7 +1177,7 @@ impl Timeline {
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
|
||||
&(Key::MIN..Key::MAX), // covers the full key range
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
@@ -1240,20 +1187,24 @@ impl Timeline {
|
||||
let delta_split_points = delta_split_points.into_iter().collect_vec();
|
||||
let mut current_delta_split_point = 0;
|
||||
let mut delta_layers = Vec::new();
|
||||
for item @ (key, _, _) in &all_key_values {
|
||||
if &last_key == key {
|
||||
accumulated_values.push(item);
|
||||
while let Some((key, lsn, val)) = merge_iter.next().await? {
|
||||
if last_key.is_none() || last_key.as_ref() == Some(&key) {
|
||||
if last_key.is_none() {
|
||||
last_key = Some(key);
|
||||
}
|
||||
accumulated_values.push((key, lsn, val));
|
||||
} else {
|
||||
let last_key = last_key.as_mut().unwrap();
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
|
||||
flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
image_layer_writer.put_image(*last_key, image, ctx).await?;
|
||||
delta_values.extend(deltas);
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
last_key,
|
||||
*last_key,
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
@@ -1263,11 +1214,12 @@ impl Timeline {
|
||||
.await?,
|
||||
);
|
||||
accumulated_values.clear();
|
||||
accumulated_values.push(item);
|
||||
last_key = *key;
|
||||
*last_key = key;
|
||||
accumulated_values.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
|
||||
let last_key = last_key.expect("no keys produced during compaction");
|
||||
// TODO: move this part to the loop body
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
|
||||
|
||||
@@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces(
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
async fn remove_timeline_from_tenant(
|
||||
tenant: &Tenant,
|
||||
timeline: &Timeline,
|
||||
timeline_id: TimelineId,
|
||||
_: &DeletionGuard, // using it as a witness
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove the timeline from the map.
|
||||
let mut timelines = tenant.timelines.lock().unwrap();
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
@@ -163,14 +163,8 @@ async fn remove_timeline_from_tenant(
|
||||
panic!("Timeline grew children while we removed layer files");
|
||||
}
|
||||
|
||||
// Unlink from parent
|
||||
if let Some(ancestor) = timeline.get_ancestor_timeline() {
|
||||
let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
|
||||
ancestor_gc_info.remove_child(timeline.timeline_id);
|
||||
}
|
||||
|
||||
timelines
|
||||
.remove(&timeline.timeline_id)
|
||||
.remove(&timeline_id)
|
||||
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
|
||||
|
||||
drop(timelines);
|
||||
@@ -188,13 +182,15 @@ async fn remove_timeline_from_tenant(
|
||||
/// 5. Delete index part
|
||||
/// 6. Delete meta, timeline directory
|
||||
/// 7. Delete mark file
|
||||
///
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are three entrypoints to the process:
|
||||
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
|
||||
/// and we possibly neeed to continue deletion of remote files.
|
||||
/// and we possibly neeed to continue deletion of remote files.
|
||||
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
|
||||
/// index but still have local metadata, timeline directory and delete mark.
|
||||
/// index but still have local metadata, timeline directory and delete mark.
|
||||
///
|
||||
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTimelineFlow {
|
||||
@@ -299,9 +295,6 @@ impl DeleteTimelineFlow {
|
||||
{
|
||||
let mut locked = tenant.timelines.lock().unwrap();
|
||||
locked.insert(timeline_id, Arc::clone(&timeline));
|
||||
|
||||
// Note that we do not insert this into the parent branch's GcInfo: the parent is not obliged to retain
|
||||
// any data for child timelines being deleted.
|
||||
}
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
@@ -422,7 +415,7 @@ impl DeleteTimelineFlow {
|
||||
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
|
||||
remove_timeline_from_tenant(tenant, timeline, &guard).await?;
|
||||
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::{
|
||||
},
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
};
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
|
||||
@@ -39,6 +40,9 @@ pub(crate) enum Error {
|
||||
|
||||
#[error("unexpected error")]
|
||||
Unexpected(#[source] anyhow::Error),
|
||||
|
||||
#[error("failpoint: {}", .0)]
|
||||
Failpoint(&'static str),
|
||||
}
|
||||
|
||||
impl From<Error> for ApiError {
|
||||
@@ -57,11 +61,41 @@ impl From<Error> for ApiError {
|
||||
| e @ Error::CopyDeltaPrefix(_)
|
||||
| e @ Error::UploadRewritten(_)
|
||||
| e @ Error::CopyFailed(_)
|
||||
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
|
||||
| e @ Error::Unexpected(_)
|
||||
| e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::upload_queue::NotInitialized> for Error {
|
||||
fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self {
|
||||
// treat all as shutting down signals, even though that is not entirely correct
|
||||
// (uninitialized state)
|
||||
Error::ShuttingDown
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FlushLayerError> for Error {
|
||||
fn from(value: FlushLayerError) -> Self {
|
||||
match value {
|
||||
FlushLayerError::Cancelled => Error::ShuttingDown,
|
||||
FlushLayerError::NotRunning(_) => {
|
||||
// FIXME(#6424): technically statically unreachable right now, given how we never
|
||||
// drop the sender
|
||||
Error::ShuttingDown
|
||||
}
|
||||
FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => {
|
||||
Error::FlushAncestor(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum Progress {
|
||||
Prepared(completion::Completion, PreparedTimelineDetach),
|
||||
Done(AncestorDetached),
|
||||
}
|
||||
|
||||
pub(crate) struct PreparedTimelineDetach {
|
||||
layers: Vec<Layer>,
|
||||
}
|
||||
@@ -88,7 +122,7 @@ pub(super) async fn prepare(
|
||||
tenant: &Tenant,
|
||||
options: Options,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
|
||||
) -> Result<Progress, Error> {
|
||||
use Error::*;
|
||||
|
||||
let Some((ancestor, ancestor_lsn)) = detached
|
||||
@@ -96,15 +130,67 @@ pub(super) async fn prepare(
|
||||
.as_ref()
|
||||
.map(|tl| (tl.clone(), detached.ancestor_lsn))
|
||||
else {
|
||||
// TODO: check if we have already been detached; for this we need to read the stored data
|
||||
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
|
||||
// a projection of the commited data.
|
||||
{
|
||||
let accessor = detached.remote_client.initialized_upload_queue()?;
|
||||
|
||||
// we are safe to inspect the latest uploaded, because we can only witness this after
|
||||
// restart is complete and ancestor is no more.
|
||||
let latest = accessor.latest_uploaded_index_part();
|
||||
if !latest.lineage.is_detached_from_original_ancestor() {
|
||||
return Err(NoAncestor);
|
||||
}
|
||||
}
|
||||
|
||||
// detached has previously been detached; let's inspect each of the current timelines and
|
||||
// report back the timelines which have been reparented by our detach
|
||||
let mut all_direct_children = tenant
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
|
||||
.map(|tl| (tl.ancestor_lsn, tl.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut any_shutdown = false;
|
||||
|
||||
all_direct_children.retain(
|
||||
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
|
||||
Ok(accessor) => accessor
|
||||
.latest_uploaded_index_part()
|
||||
.lineage
|
||||
.is_reparented(),
|
||||
Err(_shutdownalike) => {
|
||||
// not 100% a shutdown, but let's bail early not to give inconsistent results in
|
||||
// sharded enviroment.
|
||||
any_shutdown = true;
|
||||
true
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
if any_shutdown {
|
||||
// it could be one or many being deleted; have client retry
|
||||
return Err(Error::ShuttingDown);
|
||||
}
|
||||
|
||||
let mut reparented = all_direct_children;
|
||||
// why this instead of hashset? there is a reason, but I've forgotten it many times.
|
||||
//
|
||||
// the error is wrong per openapi
|
||||
return Err(NoAncestor);
|
||||
// maybe if this was a hashset we would not be able to distinguish some race condition.
|
||||
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
|
||||
|
||||
return Ok(Progress::Done(AncestorDetached {
|
||||
reparented_timelines: reparented
|
||||
.into_iter()
|
||||
.map(|(_, tl)| tl.timeline_id)
|
||||
.collect(),
|
||||
}));
|
||||
};
|
||||
|
||||
if !ancestor_lsn.is_valid() {
|
||||
// rare case, probably wouldn't even load
|
||||
tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
|
||||
return Err(NoAncestor);
|
||||
}
|
||||
|
||||
@@ -131,6 +217,15 @@ pub(super) async fn prepare(
|
||||
|
||||
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
|
||||
|
||||
utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
|
||||
|
||||
fail::fail_point!(
|
||||
"timeline-detach-ancestor::before_starting_after_locking",
|
||||
|_| Err(Error::Failpoint(
|
||||
"timeline-detach-ancestor::before_starting_after_locking"
|
||||
))
|
||||
);
|
||||
|
||||
if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
|
||||
let span =
|
||||
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
|
||||
@@ -151,7 +246,7 @@ pub(super) async fn prepare(
|
||||
}
|
||||
};
|
||||
|
||||
res.map_err(FlushAncestor)?;
|
||||
res?;
|
||||
|
||||
// we do not need to wait for uploads to complete but we do need `struct Layer`,
|
||||
// copying delta prefix is unsupported currently for `InMemoryLayer`.
|
||||
@@ -159,7 +254,7 @@ pub(super) async fn prepare(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"froze and flushed the ancestor"
|
||||
);
|
||||
Ok(())
|
||||
Ok::<_, Error>(())
|
||||
}
|
||||
.instrument(span)
|
||||
.await?;
|
||||
@@ -283,7 +378,7 @@ pub(super) async fn prepare(
|
||||
|
||||
let prepared = PreparedTimelineDetach { layers: new_layers };
|
||||
|
||||
Ok((guard, prepared))
|
||||
Ok(Progress::Prepared(guard, prepared))
|
||||
}
|
||||
|
||||
fn partition_work(
|
||||
@@ -350,7 +445,11 @@ async fn copy_lsn_prefix(
|
||||
target_timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<ResidentLayer>, Error> {
|
||||
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};
|
||||
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown};
|
||||
|
||||
if target_timeline.cancel.is_cancelled() {
|
||||
return Err(ShuttingDown);
|
||||
}
|
||||
|
||||
tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
|
||||
|
||||
@@ -529,7 +628,7 @@ pub(super) async fn complete(
|
||||
match res {
|
||||
Ok(Some(timeline)) => {
|
||||
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
|
||||
reparented.push(timeline.timeline_id);
|
||||
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
|
||||
}
|
||||
Ok(None) => {
|
||||
// lets just ignore this for now. one or all reparented timelines could had
|
||||
@@ -551,5 +650,12 @@ pub(super) async fn complete(
|
||||
tracing::info!("failed to reparent some candidates");
|
||||
}
|
||||
|
||||
reparented.sort_unstable();
|
||||
|
||||
let reparented = reparented
|
||||
.into_iter()
|
||||
.map(|(_, timeline_id)| timeline_id)
|
||||
.collect();
|
||||
|
||||
Ok(reparented)
|
||||
}
|
||||
|
||||
@@ -255,14 +255,6 @@ impl LayerManager {
|
||||
new_layer.layer_desc().lsn_range
|
||||
);
|
||||
|
||||
// Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
|
||||
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
|
||||
// always marking rewritten layers as visible.
|
||||
new_layer
|
||||
.as_ref()
|
||||
.access_stats()
|
||||
.set_visibility(old_layer.access_stats().get_visibility());
|
||||
|
||||
// Safety: we may never rewrite the same file in-place. Callers are responsible
|
||||
// for ensuring that they only rewrite layers after something changes the path,
|
||||
// such as an increment in the generation number.
|
||||
@@ -347,6 +339,10 @@ impl LayerManager {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
|
||||
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.layer_fmgr.contains_key(key)
|
||||
}
|
||||
|
||||
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
|
||||
self.layer_fmgr.0.keys().cloned().collect_vec()
|
||||
}
|
||||
@@ -371,6 +367,10 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.0.contains_key(key)
|
||||
}
|
||||
|
||||
pub(crate) fn insert(&mut self, layer: T) {
|
||||
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
||||
if present.is_some() && cfg!(debug_assertions) {
|
||||
|
||||
@@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
|
||||
/// Calculation consists of two stages:
|
||||
///
|
||||
/// 1. Initial size calculation. That might take a long time, because it requires
|
||||
/// reading all layers containing relation sizes at `initial_part_end`.
|
||||
/// reading all layers containing relation sizes at `initial_part_end`.
|
||||
///
|
||||
/// 2. Collecting an incremental part and adding that to the initial size.
|
||||
/// Increments are appended on walreceiver writing new timeline data,
|
||||
/// which result in increase or decrease of the logical size.
|
||||
/// Increments are appended on walreceiver writing new timeline data,
|
||||
/// which result in increase or decrease of the logical size.
|
||||
pub(super) struct LogicalSize {
|
||||
/// Size, potentially slow to compute. Calculating this might require reading multiple
|
||||
/// layers, and even ancestor's layers.
|
||||
@@ -45,17 +45,17 @@ pub(super) struct LogicalSize {
|
||||
/// Size shouldn't ever be negative, but this is signed for two reasons:
|
||||
///
|
||||
/// 1. If we initialized the "baseline" size lazily, while we already
|
||||
/// process incoming WAL, the incoming WAL records could decrement the
|
||||
/// variable and temporarily make it negative. (This is just future-proofing;
|
||||
/// the initialization is currently not done lazily.)
|
||||
/// process incoming WAL, the incoming WAL records could decrement the
|
||||
/// variable and temporarily make it negative. (This is just future-proofing;
|
||||
/// the initialization is currently not done lazily.)
|
||||
///
|
||||
/// 2. If there is a bug and we e.g. forget to increment it in some cases
|
||||
/// when size grows, but remember to decrement it when it shrinks again, the
|
||||
/// variable could go negative. In that case, it seems better to at least
|
||||
/// try to keep tracking it, rather than clamp or overflow it. Note that
|
||||
/// get_current_logical_size() will clamp the returned value to zero if it's
|
||||
/// negative, and log an error. Could set it permanently to zero or some
|
||||
/// special value to indicate "broken" instead, but this will do for now.
|
||||
/// when size grows, but remember to decrement it when it shrinks again, the
|
||||
/// variable could go negative. In that case, it seems better to at least
|
||||
/// try to keep tracking it, rather than clamp or overflow it. Note that
|
||||
/// get_current_logical_size() will clamp the returned value to zero if it's
|
||||
/// negative, and log an error. Could set it permanently to zero or some
|
||||
/// special value to indicate "broken" instead, but this will do for now.
|
||||
///
|
||||
/// Note that we also expose a copy of this value as a prometheus metric,
|
||||
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
//! To do so, a current implementation needs to do the following:
|
||||
//!
|
||||
//! * acknowledge the timelines that it needs to stream WAL into.
|
||||
//! Pageserver is able to dynamically (un)load tenants on attach and detach,
|
||||
//! hence WAL receiver needs to react on such events.
|
||||
//! Pageserver is able to dynamically (un)load tenants on attach and detach,
|
||||
//! hence WAL receiver needs to react on such events.
|
||||
//!
|
||||
//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
|
||||
//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
|
||||
//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
|
||||
//! Without this data, no WAL streaming is possible currently.
|
||||
//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
|
||||
//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
|
||||
//! Without this data, no WAL streaming is possible currently.
|
||||
//!
|
||||
//! Only one active WAL streaming connection is allowed at a time.
|
||||
//! The connection is supposed to be updated periodically, based on safekeeper timeline data.
|
||||
|
||||
@@ -1118,7 +1118,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("no_connection_no_candidate")?;
|
||||
let harness = TenantHarness::create("no_connection_no_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1151,7 +1151,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("connection_no_candidate")?;
|
||||
let harness = TenantHarness::create("connection_no_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1216,7 +1216,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_connection_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("no_connection_candidate")?;
|
||||
let harness = TenantHarness::create("no_connection_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1279,7 +1279,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("candidate_with_many_connection_failures")?;
|
||||
let harness = TenantHarness::create("candidate_with_many_connection_failures").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1319,7 +1319,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
|
||||
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1385,7 +1385,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
|
||||
let harness =
|
||||
TenantHarness::create("timeout_connection_threshold_current_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1448,7 +1449,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
|
||||
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let new_lsn = Lsn(100_100).align();
|
||||
@@ -1550,7 +1551,7 @@ mod tests {
|
||||
// and pageserver should prefer to connect to it.
|
||||
let test_az = Some("test_az".to_owned());
|
||||
|
||||
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
|
||||
let harness = TenantHarness::create("switch_to_same_availability_zone").await?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
state.conf.availability_zone.clone_from(&test_az);
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
|
||||
@@ -228,18 +228,20 @@ impl UploadQueue {
|
||||
Ok(self.initialized_mut().expect("we just set it"))
|
||||
}
|
||||
|
||||
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
pub(crate) fn initialized_mut(
|
||||
&mut self,
|
||||
) -> Result<&mut UploadQueueInitialized, NotInitialized> {
|
||||
use UploadQueue::*;
|
||||
match self {
|
||||
Uninitialized => Err(NotInitialized::Uninitialized.into()),
|
||||
Uninitialized => Err(NotInitialized::Uninitialized),
|
||||
Initialized(x) => {
|
||||
if x.shutting_down {
|
||||
Err(NotInitialized::ShuttingDown.into())
|
||||
Err(NotInitialized::ShuttingDown)
|
||||
} else {
|
||||
Ok(x)
|
||||
}
|
||||
}
|
||||
Stopped(_) => Err(NotInitialized::Stopped.into()),
|
||||
Stopped(_) => Err(NotInitialized::Stopped),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -20,11 +20,13 @@ use std::num::NonZeroUsize;
|
||||
|
||||
use bytes::BytesMut;
|
||||
use pageserver_api::key::Key;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::vec_map::VecMap;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
@@ -68,7 +70,7 @@ impl VectoredRead {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq)]
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) enum VectoredReadExtended {
|
||||
Yes,
|
||||
No,
|
||||
@@ -91,7 +93,7 @@ impl VectoredReadBuilder {
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: Option<usize>,
|
||||
max_read_size: usize,
|
||||
) -> Self {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
@@ -102,10 +104,9 @@ impl VectoredReadBuilder {
|
||||
start: start_offset,
|
||||
end: end_offset,
|
||||
blobs_at,
|
||||
max_read_size,
|
||||
max_read_size: Some(max_read_size),
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to extend the current read with a new blob if the start
|
||||
/// offset matches with the current end of the vectored read
|
||||
/// and the resuting size is below the max read size
|
||||
@@ -164,7 +165,7 @@ pub struct VectoredReadPlanner {
|
||||
// Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
|
||||
prev: Option<(Key, Lsn, u64, BlobFlag)>,
|
||||
|
||||
max_read_size: Option<usize>,
|
||||
max_read_size: usize,
|
||||
}
|
||||
|
||||
impl VectoredReadPlanner {
|
||||
@@ -172,20 +173,7 @@ impl VectoredReadPlanner {
|
||||
Self {
|
||||
blobs: BTreeMap::new(),
|
||||
prev: None,
|
||||
max_read_size: Some(max_read_size),
|
||||
}
|
||||
}
|
||||
|
||||
/// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`],
|
||||
/// it uses the vectored read planner to avoid duplicated logic on handling blob start/end, while expecting the vectored
|
||||
/// read planner to give a single read to a continuous range of bytes in the image layer. Therefore, it does not need the
|
||||
/// code path to split reads into chunks of `max_read_size`, and controls the read size itself.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn new_caller_controlled_max_limit() -> Self {
|
||||
Self {
|
||||
blobs: BTreeMap::new(),
|
||||
prev: None,
|
||||
max_read_size: None,
|
||||
max_read_size,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -203,9 +191,9 @@ impl VectoredReadPlanner {
|
||||
///
|
||||
/// The `flag` argument has two interesting values:
|
||||
/// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs.
|
||||
/// This is used for WAL records that `will_init`.
|
||||
/// This is used for WAL records that `will_init`.
|
||||
/// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
|
||||
/// if the blob is cached.
|
||||
/// if the blob is cached.
|
||||
pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
|
||||
// Implementation note: internally lag behind by one blob such that
|
||||
// we have a start and end offset when initialising [`VectoredRead`]
|
||||
@@ -315,7 +303,7 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
read.size(),
|
||||
buf.capacity()
|
||||
);
|
||||
let buf = self
|
||||
let mut buf = self
|
||||
.file
|
||||
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
|
||||
.await?
|
||||
@@ -337,38 +325,68 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
.chain(std::iter::once(None)),
|
||||
);
|
||||
|
||||
// Some scratch space, put here for reusing the allocation
|
||||
let mut decompressed_vec = Vec::new();
|
||||
|
||||
for ((offset, meta), next) in pairs {
|
||||
let offset_in_buf = offset - start_offset;
|
||||
let first_len_byte = buf[offset_in_buf as usize];
|
||||
|
||||
// Each blob is prefixed by a header containing it's size.
|
||||
// Each blob is prefixed by a header containing its size and compression information.
|
||||
// Extract the size and skip that header to find the start of the data.
|
||||
// The size can be 1 or 4 bytes. The most significant bit is 0 in the
|
||||
// 1 byte case and 1 in the 4 byte case.
|
||||
let (size_length, blob_size) = if first_len_byte < 0x80 {
|
||||
(1, first_len_byte as u64)
|
||||
let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 {
|
||||
(1, first_len_byte as u64, BYTE_UNCOMPRESSED)
|
||||
} else {
|
||||
let mut blob_size_buf = [0u8; 4];
|
||||
let offset_in_buf = offset_in_buf as usize;
|
||||
|
||||
blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
|
||||
blob_size_buf[0] &= 0x7f;
|
||||
(4, u32::from_be_bytes(blob_size_buf) as u64)
|
||||
blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
|
||||
|
||||
let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
|
||||
(
|
||||
4,
|
||||
u32::from_be_bytes(blob_size_buf) as u64,
|
||||
compression_bits,
|
||||
)
|
||||
};
|
||||
|
||||
let start = offset_in_buf + size_length;
|
||||
let end = match next {
|
||||
let start_raw = offset_in_buf + size_length;
|
||||
let end_raw = match next {
|
||||
Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
|
||||
None => start + blob_size,
|
||||
None => start_raw + blob_size,
|
||||
};
|
||||
|
||||
assert_eq!(end - start, blob_size);
|
||||
assert_eq!(end_raw - start_raw, blob_size);
|
||||
let (start, end);
|
||||
if compression_bits == BYTE_UNCOMPRESSED {
|
||||
start = start_raw as usize;
|
||||
end = end_raw as usize;
|
||||
} else if compression_bits == BYTE_ZSTD {
|
||||
let mut decoder =
|
||||
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
|
||||
decoder
|
||||
.write_all(&buf[start_raw as usize..end_raw as usize])
|
||||
.await?;
|
||||
decoder.flush().await?;
|
||||
start = buf.len();
|
||||
buf.extend_from_slice(&decompressed_vec);
|
||||
end = buf.len();
|
||||
decompressed_vec.clear();
|
||||
} else {
|
||||
let error = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("invalid compression byte {compression_bits:x}"),
|
||||
);
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
metas.push(VectoredBlob {
|
||||
start: start as usize,
|
||||
end: end as usize,
|
||||
start,
|
||||
end,
|
||||
meta: *meta,
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
Ok(VectoredBlobsBuf { buf, blobs: metas })
|
||||
@@ -376,88 +394,120 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
}
|
||||
|
||||
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
|
||||
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and
|
||||
/// max_cnt constraints. Underlying it uses [`VectoredReadPlanner`].
|
||||
#[cfg(test)]
|
||||
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
|
||||
/// max_cnt constraints.
|
||||
pub struct StreamingVectoredReadPlanner {
|
||||
planner: VectoredReadPlanner,
|
||||
/// Max read size per batch
|
||||
read_builder: Option<VectoredReadBuilder>,
|
||||
// Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
|
||||
prev: Option<(Key, Lsn, u64)>,
|
||||
/// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
|
||||
/// we will produce a single batch instead of split them.
|
||||
max_read_size: u64,
|
||||
/// Max item count per batch
|
||||
max_cnt: usize,
|
||||
/// The first offset of this batch
|
||||
this_batch_first_offset: Option<u64>,
|
||||
/// Size of the current batch
|
||||
cnt: usize,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl StreamingVectoredReadPlanner {
|
||||
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
|
||||
assert!(max_cnt > 0);
|
||||
assert!(max_read_size > 0);
|
||||
Self {
|
||||
// We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call.
|
||||
// Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability,
|
||||
// to avoid splitting into two I/Os.
|
||||
planner: VectoredReadPlanner::new_caller_controlled_max_limit(),
|
||||
read_builder: None,
|
||||
prev: None,
|
||||
max_cnt,
|
||||
max_read_size,
|
||||
this_batch_first_offset: None,
|
||||
cnt: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead {
|
||||
let planner = std::mem::replace(
|
||||
&mut self.planner,
|
||||
VectoredReadPlanner::new_caller_controlled_max_limit(),
|
||||
);
|
||||
self.this_batch_first_offset = Some(this_batch_first_offset);
|
||||
self.cnt = 1;
|
||||
let mut batch = planner.finish();
|
||||
assert_eq!(batch.len(), 1, "should have exactly one read batch");
|
||||
batch.pop().unwrap()
|
||||
pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option<VectoredRead> {
|
||||
// Implementation note: internally lag behind by one blob such that
|
||||
// we have a start and end offset when initialising [`VectoredRead`]
|
||||
let (prev_key, prev_lsn, prev_offset) = match self.prev {
|
||||
None => {
|
||||
self.prev = Some((key, lsn, offset));
|
||||
return None;
|
||||
}
|
||||
Some(prev) => prev,
|
||||
};
|
||||
|
||||
let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false);
|
||||
|
||||
self.prev = Some((key, lsn, offset));
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
pub fn handle(
|
||||
pub fn handle_range_end(&mut self, offset: u64) -> Option<VectoredRead> {
|
||||
let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev {
|
||||
self.add_blob(prev_key, prev_lsn, prev_offset, offset, true)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.prev = None;
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
fn add_blob(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
offset: u64,
|
||||
flag: BlobFlag,
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
is_last_blob_in_read: bool,
|
||||
) -> Option<VectoredRead> {
|
||||
if let Some(begin_offset) = self.this_batch_first_offset {
|
||||
// Each batch will have at least one item b/c `self.this_batch_first_offset` is set
|
||||
// after one item gets processed
|
||||
if offset - begin_offset > self.max_read_size {
|
||||
self.planner.handle_range_end(offset); // End the current batch with the offset
|
||||
let batch = self.emit(offset); // Produce a batch
|
||||
self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
|
||||
return Some(batch);
|
||||
match &mut self.read_builder {
|
||||
Some(read_builder) => {
|
||||
let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
|
||||
assert_eq!(extended, VectoredReadExtended::Yes);
|
||||
}
|
||||
} else {
|
||||
self.this_batch_first_offset = Some(offset)
|
||||
}
|
||||
if self.cnt >= self.max_cnt {
|
||||
self.planner.handle_range_end(offset); // End the current batch with the offset
|
||||
let batch = self.emit(offset); // Produce a batch
|
||||
self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
|
||||
return Some(batch);
|
||||
}
|
||||
self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch
|
||||
self.cnt += 1;
|
||||
None
|
||||
}
|
||||
None => {
|
||||
self.read_builder = {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
.append(start_offset, BlobMeta { key, lsn })
|
||||
.expect("First insertion always succeeds");
|
||||
|
||||
pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead {
|
||||
self.planner.handle_range_end(offset);
|
||||
self.emit(offset)
|
||||
Some(VectoredReadBuilder {
|
||||
start: start_offset,
|
||||
end: end_offset,
|
||||
blobs_at,
|
||||
max_read_size: None,
|
||||
})
|
||||
};
|
||||
}
|
||||
}
|
||||
let read_builder = self.read_builder.as_mut().unwrap();
|
||||
self.cnt += 1;
|
||||
if is_last_blob_in_read
|
||||
|| read_builder.size() >= self.max_read_size as usize
|
||||
|| self.cnt >= self.max_cnt
|
||||
{
|
||||
let prev_read_builder = self.read_builder.take();
|
||||
self.cnt = 0;
|
||||
|
||||
// `current_read_builder` is None in the first iteration
|
||||
if let Some(read_builder) = prev_read_builder {
|
||||
return Some(read_builder.build());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use anyhow::Error;
|
||||
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
use super::super::blob_io::tests::{random_array, write_maybe_compressed};
|
||||
use super::*;
|
||||
|
||||
fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
|
||||
@@ -509,8 +559,11 @@ mod tests {
|
||||
planner.handle_range_end(652 * 1024);
|
||||
|
||||
let reads = planner.finish();
|
||||
|
||||
assert_eq!(reads.len(), 6);
|
||||
|
||||
// TODO: could remove zero reads to produce 5 reads here
|
||||
|
||||
for (idx, read) in reads.iter().enumerate() {
|
||||
validate_read(read, ranges[idx]);
|
||||
}
|
||||
@@ -548,4 +601,187 @@ mod tests {
|
||||
validate_read(read, ranges[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn streaming_planner_max_read_size_test() {
|
||||
let max_read_size = 128 * 1024;
|
||||
let key = Key::MIN;
|
||||
let lsn = Lsn(0);
|
||||
|
||||
let blob_descriptions = vec![
|
||||
(key, lsn, 0, BlobFlag::None),
|
||||
(key, lsn, 32 * 1024, BlobFlag::None),
|
||||
(key, lsn, 96 * 1024, BlobFlag::None),
|
||||
(key, lsn, 128 * 1024, BlobFlag::None),
|
||||
(key, lsn, 198 * 1024, BlobFlag::None),
|
||||
(key, lsn, 268 * 1024, BlobFlag::None),
|
||||
(key, lsn, 396 * 1024, BlobFlag::None),
|
||||
(key, lsn, 652 * 1024, BlobFlag::None),
|
||||
];
|
||||
|
||||
let ranges = [
|
||||
&blob_descriptions[0..3],
|
||||
&blob_descriptions[3..5],
|
||||
&blob_descriptions[5..6],
|
||||
&blob_descriptions[6..7],
|
||||
&blob_descriptions[7..],
|
||||
];
|
||||
|
||||
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000);
|
||||
let mut reads = Vec::new();
|
||||
for (key, lsn, offset, _) in blob_descriptions.clone() {
|
||||
reads.extend(planner.handle(key, lsn, offset));
|
||||
}
|
||||
reads.extend(planner.handle_range_end(652 * 1024));
|
||||
|
||||
assert_eq!(reads.len(), ranges.len());
|
||||
|
||||
for (idx, read) in reads.iter().enumerate() {
|
||||
validate_read(read, ranges[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn streaming_planner_max_cnt_test() {
|
||||
let max_read_size = 1024 * 1024;
|
||||
let key = Key::MIN;
|
||||
let lsn = Lsn(0);
|
||||
|
||||
let blob_descriptions = vec![
|
||||
(key, lsn, 0, BlobFlag::None),
|
||||
(key, lsn, 32 * 1024, BlobFlag::None),
|
||||
(key, lsn, 96 * 1024, BlobFlag::None),
|
||||
(key, lsn, 128 * 1024, BlobFlag::None),
|
||||
(key, lsn, 198 * 1024, BlobFlag::None),
|
||||
(key, lsn, 268 * 1024, BlobFlag::None),
|
||||
(key, lsn, 396 * 1024, BlobFlag::None),
|
||||
(key, lsn, 652 * 1024, BlobFlag::None),
|
||||
];
|
||||
|
||||
let ranges = [
|
||||
&blob_descriptions[0..2],
|
||||
&blob_descriptions[2..4],
|
||||
&blob_descriptions[4..6],
|
||||
&blob_descriptions[6..],
|
||||
];
|
||||
|
||||
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
|
||||
let mut reads = Vec::new();
|
||||
for (key, lsn, offset, _) in blob_descriptions.clone() {
|
||||
reads.extend(planner.handle(key, lsn, offset));
|
||||
}
|
||||
reads.extend(planner.handle_range_end(652 * 1024));
|
||||
|
||||
assert_eq!(reads.len(), ranges.len());
|
||||
|
||||
for (idx, read) in reads.iter().enumerate() {
|
||||
validate_read(read, ranges[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn streaming_planner_edge_test() {
|
||||
let max_read_size = 1024 * 1024;
|
||||
let key = Key::MIN;
|
||||
let lsn = Lsn(0);
|
||||
{
|
||||
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
|
||||
let mut reads = Vec::new();
|
||||
reads.extend(planner.handle_range_end(652 * 1024));
|
||||
assert!(reads.is_empty());
|
||||
}
|
||||
{
|
||||
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
|
||||
let mut reads = Vec::new();
|
||||
reads.extend(planner.handle(key, lsn, 0));
|
||||
reads.extend(planner.handle_range_end(652 * 1024));
|
||||
assert_eq!(reads.len(), 1);
|
||||
validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
|
||||
}
|
||||
{
|
||||
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
|
||||
let mut reads = Vec::new();
|
||||
reads.extend(planner.handle(key, lsn, 0));
|
||||
reads.extend(planner.handle(key, lsn, 128 * 1024));
|
||||
reads.extend(planner.handle_range_end(652 * 1024));
|
||||
assert_eq!(reads.len(), 2);
|
||||
validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
|
||||
validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]);
|
||||
}
|
||||
{
|
||||
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
|
||||
let mut reads = Vec::new();
|
||||
reads.extend(planner.handle(key, lsn, 0));
|
||||
reads.extend(planner.handle(key, lsn, 128 * 1024));
|
||||
reads.extend(planner.handle_range_end(652 * 1024));
|
||||
assert_eq!(reads.len(), 1);
|
||||
validate_read(
|
||||
&reads[0],
|
||||
&[
|
||||
(key, lsn, 0, BlobFlag::None),
|
||||
(key, lsn, 128 * 1024, BlobFlag::None),
|
||||
],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let (_temp_dir, pathbuf, offsets) =
|
||||
write_maybe_compressed::<true>(blobs, compression, &ctx).await?;
|
||||
|
||||
let file = VirtualFile::open(&pathbuf, &ctx).await?;
|
||||
let file_len = std::fs::metadata(&pathbuf)?.len();
|
||||
|
||||
// Multiply by two (compressed data might need more space), and add a few bytes for the header
|
||||
let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
|
||||
let mut buf = BytesMut::with_capacity(reserved_bytes);
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&file);
|
||||
let meta = BlobMeta {
|
||||
key: Key::MIN,
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
|
||||
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
|
||||
let end = offsets.get(idx + 1).unwrap_or(&file_len);
|
||||
if idx + 1 == offsets.len() {
|
||||
continue;
|
||||
}
|
||||
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
|
||||
let read = read_builder.build();
|
||||
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
|
||||
assert_eq!(result.blobs.len(), 1);
|
||||
let read_blob = &result.blobs[0];
|
||||
let read_buf = &result.buf[read_blob.start..read_blob.end];
|
||||
assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
|
||||
buf = result.buf;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_really_big_array() -> Result<(), Error> {
|
||||
let blobs = &[
|
||||
b"test".to_vec(),
|
||||
random_array(10 * PAGE_SZ),
|
||||
b"hello".to_vec(),
|
||||
random_array(66 * PAGE_SZ),
|
||||
vec![0xf3; 24 * PAGE_SZ],
|
||||
b"foobar".to_vec(),
|
||||
];
|
||||
round_trip_test_compressed(blobs, false).await?;
|
||||
round_trip_test_compressed(blobs, true).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_arrays_inc() -> Result<(), Error> {
|
||||
let blobs = (0..PAGE_SZ / 8)
|
||||
.map(|v| random_array(v * 16))
|
||||
.collect::<Vec<_>>();
|
||||
round_trip_test_compressed(&blobs, false).await?;
|
||||
round_trip_test_compressed(&blobs, true).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user