mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 05:00:38 +00:00
Compare commits
464 Commits
skyzh/allo
...
release-pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c82d646598 | ||
|
|
5f3532970e | ||
|
|
2e681e0ef8 | ||
|
|
8e216a3a59 | ||
|
|
d0a4ae3e8f | ||
|
|
a384d7d501 | ||
|
|
66f53d9d34 | ||
|
|
2af9380962 | ||
|
|
620d50432c | ||
|
|
1d43f3bee8 | ||
|
|
c746678bbc | ||
|
|
9bb4688c54 | ||
|
|
47553dbaf9 | ||
|
|
e50b914a8e | ||
|
|
e33e109403 | ||
|
|
0ee15002fc | ||
|
|
4c7956fa56 | ||
|
|
5a82182c48 | ||
|
|
37e181af8a | ||
|
|
6f4198c78a | ||
|
|
cc1664ef93 | ||
|
|
ebb6e26a64 | ||
|
|
ebc12a388c | ||
|
|
abc1efd5a6 | ||
|
|
6fa1562b57 | ||
|
|
10afac87e7 | ||
|
|
72b3c9cd11 | ||
|
|
232f2447d4 | ||
|
|
a2d2108e6a | ||
|
|
33c0d5e2f4 | ||
|
|
605fb04f89 | ||
|
|
fd1e8ec257 | ||
|
|
be23eae3b6 | ||
|
|
6f70885e11 | ||
|
|
f755979102 | ||
|
|
1d49eefbbb | ||
|
|
6c77638ea1 | ||
|
|
9b26b7bde9 | ||
|
|
93350f7018 | ||
|
|
517a3d0d86 | ||
|
|
27ca1e21be | ||
|
|
1dc01c9bed | ||
|
|
7c4c36f5ac | ||
|
|
a2d623696c | ||
|
|
7f4f1785a8 | ||
|
|
bf2a21567d | ||
|
|
24053ff4ca | ||
|
|
b147439d6b | ||
|
|
54433c0839 | ||
|
|
40bb9ff62a | ||
|
|
4688b815b1 | ||
|
|
0982ca4636 | ||
|
|
7272d9f7b3 | ||
|
|
37d555aa59 | ||
|
|
cae3e2976b | ||
|
|
51ecd1bb37 | ||
|
|
1e6bb48076 | ||
|
|
1470af0b42 | ||
|
|
f92f92b91b | ||
|
|
dbb205ae92 | ||
|
|
85072b715f | ||
|
|
6c86fe7143 | ||
|
|
66d5fe7f5b | ||
|
|
a1b9528757 | ||
|
|
1423bb8aa2 | ||
|
|
332f064a42 | ||
|
|
c962f2b447 | ||
|
|
446b3f9d28 | ||
|
|
23352dc2e9 | ||
|
|
c65fc5a955 | ||
|
|
3e624581cd | ||
|
|
fedf4f169c | ||
|
|
86d5798108 | ||
|
|
8b4088dd8a | ||
|
|
c91905e643 | ||
|
|
44b4e355a2 | ||
|
|
03666a1f37 | ||
|
|
9c92242ca0 | ||
|
|
a354071dd0 | ||
|
|
758680d4f8 | ||
|
|
1738fd0a96 | ||
|
|
87b7edfc72 | ||
|
|
def05700d5 | ||
|
|
b547681e08 | ||
|
|
0fd211537b | ||
|
|
a83bd4e81c | ||
|
|
ecdad5e6d5 | ||
|
|
d028929945 | ||
|
|
7b0e3db868 | ||
|
|
088eb72dd7 | ||
|
|
d550e3f626 | ||
|
|
8c6b41daf5 | ||
|
|
bbb050459b | ||
|
|
cab498c787 | ||
|
|
6359342ffb | ||
|
|
13285c2a5e | ||
|
|
33790d14a3 | ||
|
|
709b8cd371 | ||
|
|
1c9bbf1a92 | ||
|
|
16163fb850 | ||
|
|
73ccc2b08c | ||
|
|
c719be6474 | ||
|
|
718645e56c | ||
|
|
fbc8c36983 | ||
|
|
5519e42612 | ||
|
|
4157eaf4c5 | ||
|
|
60241127e2 | ||
|
|
f7d5322e8b | ||
|
|
41bb9c5280 | ||
|
|
69c0d61c5c | ||
|
|
63cb8ce975 | ||
|
|
907e4aa3c4 | ||
|
|
0a2a84b766 | ||
|
|
85b12ddd52 | ||
|
|
dd76f1eeee | ||
|
|
8963ac85f9 | ||
|
|
4a488b3e24 | ||
|
|
c4987b0b13 | ||
|
|
84b4821118 | ||
|
|
32ba9811f9 | ||
|
|
a0cd64c4d3 | ||
|
|
84687b743d | ||
|
|
b6f93dcec9 | ||
|
|
4f6c594973 | ||
|
|
a750c14735 | ||
|
|
9ce0dd4e55 | ||
|
|
0e1a336607 | ||
|
|
7fc2912d06 | ||
|
|
fdf231c237 | ||
|
|
1e08b5dccc | ||
|
|
030810ed3e | ||
|
|
62b74bdc2c | ||
|
|
8b7e9ed820 | ||
|
|
5dad89acd4 | ||
|
|
547b2d2827 | ||
|
|
93f29a0065 | ||
|
|
4f36494615 | ||
|
|
0a550f3e7d | ||
|
|
4bb9554e4a | ||
|
|
008616cfe6 | ||
|
|
e61ec94fbc | ||
|
|
e5152551ad | ||
|
|
b0822a5499 | ||
|
|
1fb6ab59e8 | ||
|
|
e16439400d | ||
|
|
e401f66698 | ||
|
|
2fa461b668 | ||
|
|
03d90bc0b3 | ||
|
|
268bc890ea | ||
|
|
8a6ee79f6f | ||
|
|
9052c32b46 | ||
|
|
995e729ebe | ||
|
|
76077e1ddf | ||
|
|
0467d88f06 | ||
|
|
f5eec194e7 | ||
|
|
7e00be391d | ||
|
|
d56599df2a | ||
|
|
9d9aab3680 | ||
|
|
a202b1b5cc | ||
|
|
90f731f3b1 | ||
|
|
7736b748d3 | ||
|
|
9c23333cb3 | ||
|
|
66a99009ba | ||
|
|
5d4c57491f | ||
|
|
73935ea3a2 | ||
|
|
32e595d4dd | ||
|
|
b0d69acb07 | ||
|
|
98355a419a | ||
|
|
cfb03d6cf0 | ||
|
|
d81ef3f962 | ||
|
|
5d62c67e75 | ||
|
|
53d53d5b1e | ||
|
|
29fe6ea47a | ||
|
|
640327ccb3 | ||
|
|
7cf0f6b37e | ||
|
|
03c2c569be | ||
|
|
eff6d4538a | ||
|
|
5ef7782e9c | ||
|
|
73101db8c4 | ||
|
|
bccdfc6d39 | ||
|
|
99595813bb | ||
|
|
fe07b54758 | ||
|
|
a42d173e7b | ||
|
|
e07f689238 | ||
|
|
7831eddc88 | ||
|
|
943b1bc80c | ||
|
|
95a184e9b7 | ||
|
|
3fa17e9d17 | ||
|
|
55e0fd9789 | ||
|
|
2a88889f44 | ||
|
|
5bad8126dc | ||
|
|
27bc242085 | ||
|
|
192b49cc6d | ||
|
|
e1b60f3693 | ||
|
|
2804f5323b | ||
|
|
676adc6b32 | ||
|
|
96a4e8de66 | ||
|
|
01180666b0 | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
@@ -4,6 +4,7 @@
|
||||
!Cargo.lock
|
||||
!Cargo.toml
|
||||
!Makefile
|
||||
!postgres.mk
|
||||
!rust-toolchain.toml
|
||||
!scripts/ninstall.sh
|
||||
!docker-compose/run-tests.sh
|
||||
|
||||
13
.github/workflows/build-macos.yml
vendored
13
.github/workflows/build-macos.yml
vendored
@@ -94,11 +94,6 @@ jobs:
|
||||
run: |
|
||||
make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Get postgres headers ${{ matrix.postgres-version }}
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Upload "pg_install/${{ matrix.postgres-version }}" artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
@@ -140,6 +135,12 @@ jobs:
|
||||
name: pg_install--v17
|
||||
path: pg_install/v17
|
||||
|
||||
# `actions/download-artifact` doesn't preserve permissions:
|
||||
# https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
|
||||
- name: Make pg_install/v*/bin/* executable
|
||||
run: |
|
||||
chmod +x pg_install/v*/bin/*
|
||||
|
||||
- name: Cache walproposer-lib
|
||||
id: cache_walproposer_lib
|
||||
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
|
||||
@@ -167,7 +168,7 @@ jobs:
|
||||
- name: Build walproposer-lib (only for v17)
|
||||
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
|
||||
run:
|
||||
make walproposer-lib -j$(sysctl -n hw.ncpu)
|
||||
make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1
|
||||
|
||||
- name: Upload "build/walproposer-lib" artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
|
||||
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -69,7 +69,7 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Check for file changes
|
||||
uses: step-security/paths-filter@v3
|
||||
uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
|
||||
id: files-changed
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
2
.github/workflows/large_oltp_benchmark.yml
vendored
2
.github/workflows/large_oltp_benchmark.yml
vendored
@@ -153,7 +153,7 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Benchmark database maintenance
|
||||
if: ${{ matrix.test_maintenance == 'true' }}
|
||||
if: ${{ matrix.test_maintenance }}
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
2
.github/workflows/neon_extra_builds.yml
vendored
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Check for Postgres changes
|
||||
uses: step-security/paths-filter@v3
|
||||
uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3
|
||||
id: files_changed
|
||||
with:
|
||||
token: ${{ github.token }}
|
||||
|
||||
4
.github/workflows/pre-merge-checks.yml
vendored
4
.github/workflows/pre-merge-checks.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
|
||||
- uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
|
||||
id: python-src
|
||||
with:
|
||||
files: |
|
||||
@@ -45,7 +45,7 @@ jobs:
|
||||
poetry.lock
|
||||
pyproject.toml
|
||||
|
||||
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
|
||||
- uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
|
||||
id: rust-src
|
||||
with:
|
||||
files: |
|
||||
|
||||
7
.github/workflows/proxy-benchmark.yml
vendored
7
.github/workflows/proxy-benchmark.yml
vendored
@@ -60,22 +60,23 @@ jobs:
|
||||
} >> "$GITHUB_ENV"
|
||||
|
||||
- name: Run proxy-bench
|
||||
run: ./${PROXY_BENCH_PATH}/run.sh
|
||||
run: ${PROXY_BENCH_PATH}/run.sh
|
||||
|
||||
- name: Ingest Bench Results # neon repo script
|
||||
if: success()
|
||||
if: always()
|
||||
run: |
|
||||
mkdir -p $TEST_OUTPUT
|
||||
python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT
|
||||
|
||||
- name: Push Metrics to Proxy perf database
|
||||
if: success()
|
||||
if: always()
|
||||
env:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}"
|
||||
REPORT_FROM: $TEST_OUTPUT
|
||||
run: $NEON_DIR/scripts/generate_and_push_perf_report.sh
|
||||
|
||||
- name: Docker cleanup
|
||||
if: always()
|
||||
run: docker compose down
|
||||
|
||||
- name: Notify Failure
|
||||
|
||||
23
Cargo.lock
generated
23
Cargo.lock
generated
@@ -1279,6 +1279,7 @@ dependencies = [
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"url",
|
||||
"utils",
|
||||
]
|
||||
|
||||
@@ -1316,6 +1317,7 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"p256 0.13.2",
|
||||
"pageserver_page_api",
|
||||
"postgres",
|
||||
"postgres_initdb",
|
||||
"postgres_versioninfo",
|
||||
@@ -1335,6 +1337,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tower 0.5.2",
|
||||
"tower-http",
|
||||
"tower-otel",
|
||||
@@ -4408,6 +4411,7 @@ dependencies = [
|
||||
"postgres_backend",
|
||||
"postgres_ffi_types",
|
||||
"postgres_versioninfo",
|
||||
"posthog_client_lite",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
@@ -4418,6 +4422,7 @@ dependencies = [
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror 1.0.69",
|
||||
"tracing",
|
||||
"tracing-utils",
|
||||
"utils",
|
||||
]
|
||||
@@ -4474,12 +4479,14 @@ dependencies = [
|
||||
"bytes",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"postgres_ffi_types",
|
||||
"prost 0.13.5",
|
||||
"prost-types 0.13.5",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tonic-build",
|
||||
"utils",
|
||||
@@ -5152,7 +5159,7 @@ dependencies = [
|
||||
"petgraph",
|
||||
"prettyplease",
|
||||
"prost 0.13.5",
|
||||
"prost-types 0.13.3",
|
||||
"prost-types 0.13.5",
|
||||
"regex",
|
||||
"syn 2.0.100",
|
||||
"tempfile",
|
||||
@@ -5195,9 +5202,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.13.3"
|
||||
version = "0.13.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670"
|
||||
checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
|
||||
dependencies = [
|
||||
"prost 0.13.5",
|
||||
]
|
||||
@@ -6804,6 +6811,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"clashmap",
|
||||
"compute_api",
|
||||
"control_plane",
|
||||
"cron",
|
||||
"diesel",
|
||||
@@ -6815,6 +6823,7 @@ dependencies = [
|
||||
"hex",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"json-structural-diff",
|
||||
@@ -6825,6 +6834,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
"posthog_client_lite",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"reqwest",
|
||||
@@ -7635,7 +7645,7 @@ dependencies = [
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost-build 0.13.3",
|
||||
"prost-types 0.13.3",
|
||||
"prost-types 0.13.5",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
@@ -7647,7 +7657,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9687bd5bfeafebdded2356950f278bba8226f0b32109537c4253406e09aafe1"
|
||||
dependencies = [
|
||||
"prost 0.13.5",
|
||||
"prost-types 0.13.3",
|
||||
"prost-types 0.13.5",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tonic 0.13.1",
|
||||
@@ -8676,7 +8686,6 @@ dependencies = [
|
||||
"num-iter",
|
||||
"num-rational",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"p256 0.13.2",
|
||||
"parquet",
|
||||
"prettyplease",
|
||||
|
||||
@@ -152,6 +152,7 @@ pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointe
|
||||
procfs = "0.16"
|
||||
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.13.5"
|
||||
prost-types = "0.13.5"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
@@ -199,7 +200,7 @@ tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] }
|
||||
|
||||
@@ -40,6 +40,7 @@ COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
|
||||
COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
|
||||
COPY --chown=nonroot pgxn pgxn
|
||||
COPY --chown=nonroot Makefile Makefile
|
||||
COPY --chown=nonroot postgres.mk postgres.mk
|
||||
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
|
||||
|
||||
ENV BUILD_TYPE=release
|
||||
|
||||
131
Makefile
131
Makefile
@@ -4,11 +4,14 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
# managers.
|
||||
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
|
||||
|
||||
# Supported PostgreSQL versions
|
||||
POSTGRES_VERSIONS = v17 v16 v15 v14
|
||||
|
||||
# CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked`
|
||||
# and `--features testing` are popular examples.
|
||||
#
|
||||
# CARGO_PROFILE: You can also set to override the cargo profile to
|
||||
# use. By default, it is derived from BUILD_TYPE.
|
||||
# CARGO_PROFILE: Set to override the cargo profile to use. By default,
|
||||
# it is derived from BUILD_TYPE.
|
||||
|
||||
# All intermediate build artifacts are stored here.
|
||||
BUILD_DIR := build
|
||||
@@ -95,93 +98,24 @@ CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
|
||||
# Top level Makefile to build Neon and PostgreSQL
|
||||
#
|
||||
.PHONY: all
|
||||
all: neon postgres neon-pg-ext
|
||||
all: neon postgres-install neon-pg-ext
|
||||
|
||||
### Neon Rust bits
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: neon
|
||||
neon: postgres-headers walproposer-lib cargo-target-dir
|
||||
neon: postgres-headers-install walproposer-lib cargo-target-dir
|
||||
+@echo "Compiling Neon"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)
|
||||
|
||||
.PHONY: cargo-target-dir
|
||||
cargo-target-dir:
|
||||
# https://github.com/rust-lang/cargo/issues/14281
|
||||
mkdir -p target
|
||||
test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG
|
||||
|
||||
### PostgreSQL parts
|
||||
# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
|
||||
# to avoid the duplication in the future, but it's tolerable for now.
|
||||
#
|
||||
$(BUILD_DIR)/%/config.status:
|
||||
mkdir -p $(BUILD_DIR)
|
||||
test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG
|
||||
|
||||
+@echo "Configuring Postgres $* build"
|
||||
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
|
||||
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
|
||||
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
|
||||
exit 1; }
|
||||
mkdir -p $(BUILD_DIR)/$*
|
||||
|
||||
VERSION=$*; \
|
||||
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
|
||||
(cd $(BUILD_DIR)/$$VERSION && \
|
||||
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
|
||||
CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
|
||||
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
|
||||
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
|
||||
|
||||
# nicer alias to run 'configure'
|
||||
# Note: I've been unable to use templates for this part of our configuration.
|
||||
# I'm not sure why it wouldn't work, but this is the only place (apart from
|
||||
# the "build-all-versions" entry points) where direct mention of PostgreSQL
|
||||
# versions is used.
|
||||
.PHONY: postgres-configure-v17
|
||||
postgres-configure-v17: $(BUILD_DIR)/v17/config.status
|
||||
.PHONY: postgres-configure-v16
|
||||
postgres-configure-v16: $(BUILD_DIR)/v16/config.status
|
||||
.PHONY: postgres-configure-v15
|
||||
postgres-configure-v15: $(BUILD_DIR)/v15/config.status
|
||||
.PHONY: postgres-configure-v14
|
||||
postgres-configure-v14: $(BUILD_DIR)/v14/config.status
|
||||
|
||||
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
|
||||
.PHONY: postgres-headers-%
|
||||
postgres-headers-%: postgres-configure-%
|
||||
+@echo "Installing PostgreSQL $* headers"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install
|
||||
|
||||
# Compile and install PostgreSQL
|
||||
.PHONY: postgres-%
|
||||
postgres-%: postgres-configure-% \
|
||||
postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
|
||||
+@echo "Compiling PostgreSQL $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install
|
||||
+@echo "Compiling libpq $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/src/interfaces/libpq install
|
||||
+@echo "Compiling pg_prewarm $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install
|
||||
+@echo "Compiling pg_buffercache $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install
|
||||
+@echo "Compiling pg_visibility $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install
|
||||
+@echo "Compiling pageinspect $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install
|
||||
+@echo "Compiling pg_trgm $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install
|
||||
+@echo "Compiling amcheck $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install
|
||||
+@echo "Compiling test_decoding $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install
|
||||
|
||||
.PHONY: postgres-check-%
|
||||
postgres-check-%: postgres-%
|
||||
$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check
|
||||
|
||||
.PHONY: neon-pg-ext-%
|
||||
neon-pg-ext-%: postgres-%
|
||||
neon-pg-ext-%: postgres-install-%
|
||||
+@echo "Compiling neon-specific Postgres extensions for $*"
|
||||
mkdir -p $(BUILD_DIR)/pgxn-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
@@ -220,39 +154,14 @@ ifeq ($(UNAME_S),Linux)
|
||||
pg_crc32c.o
|
||||
endif
|
||||
|
||||
# Shorthand to call neon-pg-ext-% target for all Postgres versions
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
neon-pg-ext-v14 \
|
||||
neon-pg-ext-v15 \
|
||||
neon-pg-ext-v16 \
|
||||
neon-pg-ext-v17
|
||||
|
||||
# shorthand to build all Postgres versions
|
||||
.PHONY: postgres
|
||||
postgres: \
|
||||
postgres-v14 \
|
||||
postgres-v15 \
|
||||
postgres-v16 \
|
||||
postgres-v17
|
||||
|
||||
.PHONY: postgres-headers
|
||||
postgres-headers: \
|
||||
postgres-headers-v14 \
|
||||
postgres-headers-v15 \
|
||||
postgres-headers-v16 \
|
||||
postgres-headers-v17
|
||||
|
||||
.PHONY: postgres-check
|
||||
postgres-check: \
|
||||
postgres-check-v14 \
|
||||
postgres-check-v15 \
|
||||
postgres-check-v16 \
|
||||
postgres-check-v17
|
||||
neon-pg-ext: $(foreach pg_version,$(POSTGRES_VERSIONS),neon-pg-ext-$(pg_version))
|
||||
|
||||
# This removes everything
|
||||
.PHONY: distclean
|
||||
distclean:
|
||||
$(RM) -r $(POSTGRES_INSTALL_DIR)
|
||||
$(RM) -r $(POSTGRES_INSTALL_DIR) $(BUILD_DIR)
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
.PHONY: fmt
|
||||
@@ -300,3 +209,19 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
|
||||
.PHONY: setup-pre-commit-hook
|
||||
setup-pre-commit-hook:
|
||||
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
|
||||
|
||||
# Targets for building PostgreSQL are defined in postgres.mk.
|
||||
#
|
||||
# But if the caller has indicated that PostgreSQL is already
|
||||
# installed, by setting the PG_INSTALL_CACHED variable, skip it.
|
||||
ifdef PG_INSTALL_CACHED
|
||||
postgres-install: skip-install
|
||||
$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-install-$(pg_version)): skip-install
|
||||
postgres-headers-install:
|
||||
+@echo "Skipping installation of PostgreSQL headers because PG_INSTALL_CACHED is set"
|
||||
skip-install:
|
||||
+@echo "Skipping PostgreSQL installation because PG_INSTALL_CACHED is set"
|
||||
|
||||
else
|
||||
include postgres.mk
|
||||
endif
|
||||
|
||||
@@ -165,6 +165,7 @@ RUN curl -fsSL \
|
||||
&& rm sql_exporter.tar.gz
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
# Keep the version the same as in compute/compute-node.Dockerfile
|
||||
ENV PROTOC_VERSION=25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
&& unzip -q protoc.zip -d protoc \
|
||||
@@ -179,7 +180,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
|
||||
&& mv s5cmd /usr/local/bin/s5cmd
|
||||
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=19
|
||||
ENV LLVM_VERSION=20
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
@@ -292,7 +293,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.87.0
|
||||
ENV RUSTC_VERSION=1.88.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
ARG RUSTFILT_VERSION=0.2.1
|
||||
|
||||
@@ -115,6 +115,9 @@ ARG EXTENSIONS=all
|
||||
FROM $BASE_IMAGE_SHA AS build-deps
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
# Keep in sync with build-tools.Dockerfile
|
||||
ENV PROTOC_VERSION=25.1
|
||||
|
||||
# Use strict mode for bash to catch errors early
|
||||
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
|
||||
|
||||
@@ -149,8 +152,14 @@ RUN case $DEBIAN_VERSION in \
|
||||
libclang-dev \
|
||||
jsonnet \
|
||||
$VERSION_INSTALLS \
|
||||
&& apt clean && rm -rf /var/lib/apt/lists/* && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
&& apt clean && rm -rf /var/lib/apt/lists/* \
|
||||
&& useradd -ms /bin/bash nonroot -b /home \
|
||||
# Install protoc from binary release, since Debian's versions are too old.
|
||||
&& curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
&& unzip -q protoc.zip -d protoc \
|
||||
&& mv protoc/bin/protoc /usr/local/bin/protoc \
|
||||
&& mv protoc/include/google /usr/local/include/google \
|
||||
&& rm -rf protoc.zip protoc
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -171,9 +180,6 @@ RUN cd postgres && \
|
||||
eval $CONFIGURE_CMD && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
|
||||
@@ -1173,7 +1179,7 @@ COPY --from=pgrag-src /ext-src/ /ext-src/
|
||||
# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
|
||||
WORKDIR /ext-src/onnxruntime-src
|
||||
RUN apt update && apt install --no-install-recommends --no-install-suggests -y \
|
||||
python3 python3-pip python3-venv protobuf-compiler && \
|
||||
python3 python3-pip python3-venv && \
|
||||
apt clean && rm -rf /var/lib/apt/lists/* && \
|
||||
python3 -m venv venv && \
|
||||
. venv/bin/activate && \
|
||||
@@ -1568,20 +1574,20 @@ ARG PG_VERSION
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14") \
|
||||
export PGAUDIT_VERSION=1.6.2 \
|
||||
export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \
|
||||
export PGAUDIT_VERSION=1.6.3 \
|
||||
export PGAUDIT_CHECKSUM=37a8f5a7cc8d9188e536d15cf0fdc457fcdab2547caedb54442c37f124110919 \
|
||||
;; \
|
||||
"v15") \
|
||||
export PGAUDIT_VERSION=1.7.0 \
|
||||
export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \
|
||||
export PGAUDIT_VERSION=1.7.1 \
|
||||
export PGAUDIT_CHECKSUM=e9c8e6e092d82b2f901d72555ce0fe7780552f35f8985573796cd7e64b09d4ec \
|
||||
;; \
|
||||
"v16") \
|
||||
export PGAUDIT_VERSION=16.0 \
|
||||
export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \
|
||||
export PGAUDIT_VERSION=16.1 \
|
||||
export PGAUDIT_CHECKSUM=3bae908ab70ba0c6f51224009dbcfff1a97bd6104c6273297a64292e1b921fee \
|
||||
;; \
|
||||
"v17") \
|
||||
export PGAUDIT_VERSION=17.0 \
|
||||
export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \
|
||||
export PGAUDIT_VERSION=17.1 \
|
||||
export PGAUDIT_CHECKSUM=9c5f37504d393486cc75d2ced83f75f5899be64fa85f689d6babb833b4361e6c \
|
||||
;; \
|
||||
*) \
|
||||
echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \
|
||||
|
||||
@@ -38,6 +38,7 @@ once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
opentelemetry_sdk.workspace = true
|
||||
p256 = { version = "0.13", features = ["pem"] }
|
||||
pageserver_page_api.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
@@ -53,6 +54,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tonic.workspace = true
|
||||
tower-otel.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
|
||||
@@ -36,6 +36,8 @@
|
||||
use std::ffi::OsString;
|
||||
use std::fs::File;
|
||||
use std::process::exit;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::mpsc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
@@ -190,7 +192,9 @@ fn main() -> Result<()> {
|
||||
cgroup: cli.cgroup,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor_addr: cli.vm_monitor_addr,
|
||||
installed_extensions_collection_interval: cli.installed_extensions_collection_interval,
|
||||
installed_extensions_collection_interval: Arc::new(AtomicU64::new(
|
||||
cli.installed_extensions_collection_interval,
|
||||
)),
|
||||
},
|
||||
config,
|
||||
)?;
|
||||
|
||||
@@ -6,7 +6,7 @@ use compute_api::responses::{
|
||||
LfcPrewarmState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
|
||||
};
|
||||
use futures::StreamExt;
|
||||
use futures::future::join_all;
|
||||
@@ -15,17 +15,17 @@ use itertools::Itertools;
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use nix::unistd::Pid;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_page_api::{self as page_api, BaseBackupCompression};
|
||||
use postgres;
|
||||
use postgres::NoTls;
|
||||
use postgres::error::SqlState;
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::net::SocketAddr;
|
||||
use std::os::unix::fs::{PermissionsExt, symlink};
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
@@ -36,6 +36,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
use utils::pid_file;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
use crate::configurator::launch_configurator;
|
||||
use crate::disk_quota::set_disk_quota;
|
||||
@@ -69,6 +70,7 @@ pub static BUILD_TAG: Lazy<String> = Lazy::new(|| {
|
||||
.unwrap_or(BUILD_TAG_DEFAULT)
|
||||
.to_string()
|
||||
});
|
||||
const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600;
|
||||
|
||||
/// Static configuration params that don't change after startup. These mostly
|
||||
/// come from the CLI args, or are derived from them.
|
||||
@@ -102,7 +104,7 @@ pub struct ComputeNodeParams {
|
||||
pub remote_ext_base_url: Option<Url>,
|
||||
|
||||
/// Interval for installed extensions collection
|
||||
pub installed_extensions_collection_interval: u64,
|
||||
pub installed_extensions_collection_interval: Arc<AtomicU64>,
|
||||
}
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
@@ -125,6 +127,9 @@ pub struct ComputeNode {
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
|
||||
/// Handle to the extension stats collection task
|
||||
extension_stats_task: Mutex<Option<tokio::task::JoinHandle<()>>>,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
@@ -218,7 +223,8 @@ pub struct ParsedSpec {
|
||||
pub pageserver_connstr: String,
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
pub storage_auth_token: Option<String>,
|
||||
pub endpoint_storage_addr: Option<SocketAddr>,
|
||||
/// k8s dns name and port
|
||||
pub endpoint_storage_addr: Option<String>,
|
||||
pub endpoint_storage_token: Option<String>,
|
||||
}
|
||||
|
||||
@@ -313,13 +319,10 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
.or(Err("invalid timeline id"))?
|
||||
};
|
||||
|
||||
let endpoint_storage_addr: Option<SocketAddr> = spec
|
||||
let endpoint_storage_addr: Option<String> = spec
|
||||
.endpoint_storage_addr
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"))
|
||||
.unwrap_or_default()
|
||||
.parse()
|
||||
.ok();
|
||||
.or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"));
|
||||
let endpoint_storage_token = spec
|
||||
.endpoint_storage_token
|
||||
.clone()
|
||||
@@ -429,6 +432,7 @@ impl ComputeNode {
|
||||
state_changed: Condvar::new(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
compute_ctl_config: config.compute_ctl_config,
|
||||
extension_stats_task: Mutex::new(None),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -516,6 +520,9 @@ impl ComputeNode {
|
||||
None
|
||||
};
|
||||
|
||||
// Terminate the extension stats collection task
|
||||
this.terminate_extension_stats_task();
|
||||
|
||||
// Terminate the vm_monitor so it releases the file watcher on
|
||||
// /sys/fs/cgroup/neon-postgres.
|
||||
// Note: the vm-monitor only runs on linux because it requires cgroups.
|
||||
@@ -998,13 +1005,80 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
/// Fetches a basebackup from the Pageserver using the compute state's Pageserver connstring and
|
||||
/// unarchives it to `pgdata` directory, replacing any existing contents.
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let start_time = Instant::now();
|
||||
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let started = Instant::now();
|
||||
|
||||
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
|
||||
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
|
||||
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
|
||||
};
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros =
|
||||
connected.duration_since(started).as_micros() as u64;
|
||||
state.metrics.basebackup_bytes = size as u64;
|
||||
state.metrics.basebackup_ms = started.elapsed().as_millis() as u64;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
|
||||
/// the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
let shard0_connstr = spec
|
||||
.pageserver_connstr
|
||||
.split(',')
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
|
||||
0 | 1 => ShardIndex::unsharded(),
|
||||
count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
|
||||
};
|
||||
|
||||
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::Client::new(
|
||||
shard0_connstr,
|
||||
spec.tenant_id,
|
||||
spec.timeline_id,
|
||||
shard_index,
|
||||
spec.storage_auth_token.clone(),
|
||||
None, // NB: base backups use payload compression
|
||||
)
|
||||
.await?;
|
||||
let connected = Instant::now();
|
||||
let reader = client
|
||||
.get_base_backup(page_api::GetBaseBackupRequest {
|
||||
lsn: (lsn != Lsn(0)).then_some(lsn),
|
||||
compression: BaseBackupCompression::Gzip,
|
||||
replica: spec.spec.mode != ComputeMode::Primary,
|
||||
full: false,
|
||||
})
|
||||
.await?;
|
||||
anyhow::Ok((reader, connected))
|
||||
})?;
|
||||
|
||||
let mut reader = MeasuredReader::new(tokio_util::io::SyncIoBridge::new(reader));
|
||||
|
||||
// Set `ignore_zeros` so that unpack() reads the entire stream and doesn't just stop at the
|
||||
// end-of-archive marker. If the server errors, the tar::Builder drop handler will write an
|
||||
// end-of-archive marker before the error is emitted, and we would not see the error.
|
||||
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut reader));
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.params.pgdata)?;
|
||||
|
||||
Ok((connected, reader.get_byte_count()))
|
||||
}
|
||||
|
||||
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
|
||||
/// when the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let mut config = postgres::Config::from_str(shard0_connstr)?;
|
||||
|
||||
@@ -1018,16 +1092,14 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
config.application_name("compute_ctl");
|
||||
if let Some(spec) = &compute_state.pspec {
|
||||
config.options(&format!(
|
||||
"-c neon.compute_mode={}",
|
||||
spec.spec.mode.to_type_str()
|
||||
));
|
||||
}
|
||||
config.options(&format!(
|
||||
"-c neon.compute_mode={}",
|
||||
spec.spec.mode.to_type_str()
|
||||
));
|
||||
|
||||
// Connect to pageserver
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
let connected = Instant::now();
|
||||
|
||||
let basebackup_cmd = match lsn {
|
||||
Lsn(0) => {
|
||||
@@ -1064,16 +1136,13 @@ impl ComputeNode {
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
// The tar::Builder drop handler will write an end-of-archive marker
|
||||
// before emitting the error, and we would not see it otherwise.
|
||||
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.params.pgdata)?;
|
||||
|
||||
// Report metrics
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
|
||||
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
|
||||
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
|
||||
Ok(())
|
||||
Ok((connected, measured_reader.get_byte_count()))
|
||||
}
|
||||
|
||||
// Gets the basebackup in a retry loop
|
||||
@@ -1610,6 +1679,8 @@ impl ComputeNode {
|
||||
tls_config = self.compute_ctl_config.tls.clone();
|
||||
}
|
||||
|
||||
self.update_installed_extensions_collection_interval(&spec);
|
||||
|
||||
let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
|
||||
|
||||
// Merge-apply spec & changes to PostgreSQL state.
|
||||
@@ -1674,6 +1745,8 @@ impl ComputeNode {
|
||||
|
||||
let tls_config = self.tls_config(&spec);
|
||||
|
||||
self.update_installed_extensions_collection_interval(&spec);
|
||||
|
||||
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
@@ -2278,10 +2351,20 @@ LIMIT 100",
|
||||
}
|
||||
|
||||
pub fn spawn_extension_stats_task(&self) {
|
||||
// Cancel any existing task
|
||||
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
|
||||
handle.abort();
|
||||
}
|
||||
|
||||
let conf = self.tokio_conn_conf.clone();
|
||||
let installed_extensions_collection_interval =
|
||||
self.params.installed_extensions_collection_interval;
|
||||
tokio::spawn(async move {
|
||||
let atomic_interval = self.params.installed_extensions_collection_interval.clone();
|
||||
let mut installed_extensions_collection_interval =
|
||||
2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst);
|
||||
info!(
|
||||
"[NEON_EXT_SPAWN] Spawning background installed extensions worker with Timeout: {}",
|
||||
installed_extensions_collection_interval
|
||||
);
|
||||
let handle = tokio::spawn(async move {
|
||||
// An initial sleep is added to ensure that two collections don't happen at the same time.
|
||||
// The first collection happens during compute startup.
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(
|
||||
@@ -2294,8 +2377,48 @@ LIMIT 100",
|
||||
loop {
|
||||
interval.tick().await;
|
||||
let _ = installed_extensions(conf.clone()).await;
|
||||
// Acquire a read lock on the compute spec and then update the interval if necessary
|
||||
interval = tokio::time::interval(tokio::time::Duration::from_secs(std::cmp::max(
|
||||
installed_extensions_collection_interval,
|
||||
2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst),
|
||||
)));
|
||||
installed_extensions_collection_interval = interval.period().as_secs();
|
||||
}
|
||||
});
|
||||
|
||||
// Store the new task handle
|
||||
*self.extension_stats_task.lock().unwrap() = Some(handle);
|
||||
}
|
||||
|
||||
fn terminate_extension_stats_task(&self) {
|
||||
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
|
||||
handle.abort();
|
||||
}
|
||||
}
|
||||
|
||||
fn update_installed_extensions_collection_interval(&self, spec: &ComputeSpec) {
|
||||
// Update the interval for collecting installed extensions statistics
|
||||
// If the value is -1, we never suspend so set the value to default collection.
|
||||
// If the value is 0, it means default, we will just continue to use the default.
|
||||
if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
|
||||
info!(
|
||||
"[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
|
||||
spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
|
||||
);
|
||||
self.params.installed_extensions_collection_interval.store(
|
||||
DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
|
||||
std::sync::atomic::Ordering::SeqCst,
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
"[NEON_EXT_INT_UPD] Spec Timeout: {}",
|
||||
spec.suspend_timeout_seconds
|
||||
);
|
||||
self.params.installed_extensions_collection_interval.store(
|
||||
spec.suspend_timeout_seconds as u64,
|
||||
std::sync::atomic::Ordering::SeqCst,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,9 @@ use std::thread;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use compute_api::spec::{ComputeMode, PageserverProtocol};
|
||||
use itertools::Itertools as _;
|
||||
use pageserver_page_api as page_api;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use tracing::{info, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -76,25 +78,17 @@ fn acquire_lsn_lease_with_retry(
|
||||
|
||||
loop {
|
||||
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
||||
let configs = {
|
||||
let (connstrings, auth) = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
|
||||
let spec = state.pspec.as_ref().expect("spec must be set");
|
||||
|
||||
let conn_strings = spec.pageserver_connstr.split(',');
|
||||
|
||||
conn_strings
|
||||
.map(|connstr| {
|
||||
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
||||
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
||||
config.password(storage_auth_token.clone());
|
||||
}
|
||||
config
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
(
|
||||
spec.pageserver_connstr.clone(),
|
||||
spec.storage_auth_token.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
|
||||
let result =
|
||||
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
|
||||
match result {
|
||||
Ok(Some(res)) => {
|
||||
return Ok(res);
|
||||
@@ -116,68 +110,104 @@ fn acquire_lsn_lease_with_retry(
|
||||
}
|
||||
}
|
||||
|
||||
/// Tries to acquire an LSN lease through PS page_service API.
|
||||
/// Tries to acquire LSN leases on all Pageserver shards.
|
||||
fn try_acquire_lsn_lease(
|
||||
connstrings: &str,
|
||||
auth: Option<&str>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
configs: &[postgres::Config],
|
||||
) -> Result<Option<SystemTime>> {
|
||||
fn get_valid_until(
|
||||
config: &postgres::Config,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
|
||||
let res = client.simple_query(&cmd)?;
|
||||
let msg = match res.first() {
|
||||
Some(msg) => msg,
|
||||
None => bail!("empty response"),
|
||||
};
|
||||
let row = match msg {
|
||||
SimpleQueryMessage::Row(row) => row,
|
||||
_ => bail!("error parsing lsn lease response"),
|
||||
let connstrings = connstrings.split(',').collect_vec();
|
||||
let shard_count = connstrings.len();
|
||||
let mut leases = Vec::new();
|
||||
|
||||
for (shard_number, &connstring) in connstrings.iter().enumerate() {
|
||||
let tenant_shard_id = match shard_count {
|
||||
0 | 1 => TenantShardId::unsharded(tenant_id),
|
||||
shard_count => TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(shard_number as u8),
|
||||
shard_count: ShardCount::new(shard_count as u8),
|
||||
},
|
||||
};
|
||||
|
||||
// Note: this will be None if a lease is explicitly not granted.
|
||||
let valid_until_str = row.get("valid_until");
|
||||
|
||||
let valid_until = valid_until_str.map(|s| {
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
|
||||
.expect("Time larger than max SystemTime could handle")
|
||||
});
|
||||
Ok(valid_until)
|
||||
let lease = match PageserverProtocol::from_connstring(connstring)? {
|
||||
PageserverProtocol::Libpq => {
|
||||
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
|
||||
}
|
||||
PageserverProtocol::Grpc => {
|
||||
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
|
||||
}
|
||||
};
|
||||
leases.push(lease);
|
||||
}
|
||||
|
||||
let shard_count = configs.len();
|
||||
Ok(leases.into_iter().min().flatten())
|
||||
}
|
||||
|
||||
let valid_until = if shard_count > 1 {
|
||||
configs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(shard_number, config)| {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: ShardCount::new(shard_count as u8),
|
||||
shard_number: ShardNumber(shard_number as u8),
|
||||
};
|
||||
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
|
||||
})
|
||||
.collect::<Result<Vec<Option<SystemTime>>>>()?
|
||||
.into_iter()
|
||||
.min()
|
||||
.unwrap()
|
||||
} else {
|
||||
get_valid_until(
|
||||
&configs[0],
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?
|
||||
/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a
|
||||
/// postgresql:// scheme.
|
||||
fn acquire_lsn_lease_libpq(
|
||||
connstring: &str,
|
||||
auth: Option<&str>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
let mut config = postgres::Config::from_str(connstring)?;
|
||||
if let Some(auth) = auth {
|
||||
config.password(auth);
|
||||
}
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
|
||||
let res = client.simple_query(&cmd)?;
|
||||
let msg = match res.first() {
|
||||
Some(msg) => msg,
|
||||
None => bail!("empty response"),
|
||||
};
|
||||
let row = match msg {
|
||||
SimpleQueryMessage::Row(row) => row,
|
||||
_ => bail!("error parsing lsn lease response"),
|
||||
};
|
||||
|
||||
// Note: this will be None if a lease is explicitly not granted.
|
||||
let valid_until_str = row.get("valid_until");
|
||||
|
||||
let valid_until = valid_until_str.map(|s| {
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
|
||||
.expect("Time larger than max SystemTime could handle")
|
||||
});
|
||||
Ok(valid_until)
|
||||
}
|
||||
|
||||
/// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a
|
||||
/// grpc:// scheme.
|
||||
fn acquire_lsn_lease_grpc(
|
||||
connstring: &str,
|
||||
auth: Option<&str>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::Client::new(
|
||||
connstring.to_string(),
|
||||
tenant_shard_id.tenant_id,
|
||||
timeline_id,
|
||||
tenant_shard_id.to_index(),
|
||||
auth.map(String::from),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let req = page_api::LeaseLsnRequest { lsn };
|
||||
match client.lease_lsn(req).await {
|
||||
Ok(expires) => Ok(Some(expires)),
|
||||
// Lease couldn't be acquired because the LSN has been garbage collected.
|
||||
Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None),
|
||||
Err(err) => Err(err.into()),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
|
||||
"timestamp": "2021-05-23T18:25:43.511Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
|
||||
|
||||
"suspend_timeout_seconds": 3600,
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "test-cluster-42",
|
||||
"name": "Zenith Test",
|
||||
|
||||
@@ -16,9 +16,9 @@ use std::time::Duration;
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use clap::Parser;
|
||||
use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::ComputeMode;
|
||||
use compute_api::spec::{ComputeMode, PageserverProtocol};
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol};
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::{
|
||||
@@ -1649,7 +1649,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = parse_safekeepers(&args.safekeepers)?;
|
||||
endpoint.reconfigure(pageservers, None, safekeepers).await?;
|
||||
endpoint
|
||||
.reconfigure(Some(pageservers), None, safekeepers, None)
|
||||
.await?;
|
||||
}
|
||||
EndpointCmd::Stop(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
|
||||
@@ -56,8 +56,8 @@ use compute_api::responses::{
|
||||
TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
|
||||
RemoteExtSpec, Role,
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
|
||||
PgIdent, RemoteExtSpec, Role,
|
||||
};
|
||||
use jsonwebtoken::jwk::{
|
||||
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
|
||||
@@ -373,29 +373,6 @@ impl std::fmt::Display for EndpointTerminateMode {
|
||||
}
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum PageserverProtocol {
|
||||
Libpq,
|
||||
Grpc,
|
||||
}
|
||||
|
||||
impl PageserverProtocol {
|
||||
/// Returns the URL scheme for the protocol, used in connstrings.
|
||||
pub fn scheme(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Libpq => "postgresql",
|
||||
Self::Grpc => "grpc",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for PageserverProtocol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.scheme())
|
||||
}
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
@@ -803,6 +780,7 @@ impl Endpoint {
|
||||
endpoint_storage_addr: Some(endpoint_storage_addr),
|
||||
endpoint_storage_token: Some(endpoint_storage_token),
|
||||
autoprewarm: false,
|
||||
suspend_timeout_seconds: -1, // Only used in neon_local.
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
@@ -997,12 +975,11 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure(
|
||||
&self,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
safekeeper_generation: Option<SafekeeperGeneration>,
|
||||
) -> Result<()> {
|
||||
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
|
||||
|
||||
let (mut spec, compute_ctl_config) = {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let file = std::fs::File::open(config_path)?;
|
||||
@@ -1014,16 +991,24 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
// If pageservers are not specified, don't change them.
|
||||
if let Some(pageservers) = pageservers {
|
||||
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
}
|
||||
|
||||
// If safekeepers are not specified, don't change them.
|
||||
if let Some(safekeepers) = safekeepers {
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
spec.safekeeper_connstrings = safekeeper_connstrings;
|
||||
if let Some(g) = safekeeper_generation {
|
||||
spec.safekeepers_generation = Some(g.into_inner());
|
||||
}
|
||||
}
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
@@ -1061,6 +1046,24 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn reconfigure_pageservers(
|
||||
&self,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> Result<()> {
|
||||
self.reconfigure(Some(pageservers), stripe_size, None, None)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn reconfigure_safekeepers(
|
||||
&self,
|
||||
safekeepers: Vec<NodeId>,
|
||||
generation: SafekeeperGeneration,
|
||||
) -> Result<()> {
|
||||
self.reconfigure(None, None, Some(safekeepers), Some(generation))
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn stop(
|
||||
&self,
|
||||
mode: EndpointTerminateMode,
|
||||
|
||||
@@ -12,6 +12,7 @@ use std::{env, fs};
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use clap::ValueEnum;
|
||||
use pageserver_api::config::PostHogConfig;
|
||||
use pem::Pem;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::{Certificate, Url};
|
||||
@@ -211,7 +212,9 @@ pub struct NeonStorageControllerConf {
|
||||
|
||||
pub use_local_compute_notifications: bool,
|
||||
|
||||
pub timeline_safekeeper_count: Option<i64>,
|
||||
pub timeline_safekeeper_count: Option<usize>,
|
||||
|
||||
pub posthog_config: Option<PostHogConfig>,
|
||||
|
||||
pub kick_secondary_downloads: Option<bool>,
|
||||
}
|
||||
@@ -245,6 +248,7 @@ impl Default for NeonStorageControllerConf {
|
||||
use_https_safekeeper_api: false,
|
||||
use_local_compute_notifications: true,
|
||||
timeline_safekeeper_count: None,
|
||||
posthog_config: None,
|
||||
kick_secondary_downloads: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -638,10 +638,28 @@ impl StorageController {
|
||||
args.push("--timelines-onto-safekeepers".to_string());
|
||||
}
|
||||
|
||||
if let Some(sk_cnt) = self.config.timeline_safekeeper_count {
|
||||
// neon_local is used in test environments where we often have less than 3 safekeepers.
|
||||
if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 {
|
||||
let sk_cnt = self
|
||||
.config
|
||||
.timeline_safekeeper_count
|
||||
.unwrap_or(self.env.safekeepers.len());
|
||||
|
||||
args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
|
||||
}
|
||||
|
||||
let mut envs = vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
];
|
||||
|
||||
if let Some(posthog_config) = &self.config.posthog_config {
|
||||
envs.push((
|
||||
"POSTHOG_CONFIG".to_string(),
|
||||
serde_json::to_string(posthog_config)?,
|
||||
));
|
||||
}
|
||||
|
||||
println!("Starting storage controller");
|
||||
|
||||
background_process::start_process(
|
||||
@@ -649,10 +667,7 @@ impl StorageController {
|
||||
&instance_dir,
|
||||
&self.env.storage_controller_bin(),
|
||||
args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
envs,
|
||||
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
|
||||
&start_args.start_timeout,
|
||||
|| async {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
"timestamp": "2022-10-12T18:00:00.000Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
|
||||
"suspend_timeout_seconds": -1,
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "docker_compose",
|
||||
|
||||
396
docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md
Normal file
396
docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md
Normal file
@@ -0,0 +1,396 @@
|
||||
# Memo: Endpoint Persistent Unlogged Files Storage
|
||||
Created on 2024-11-05
|
||||
Implemented on N/A
|
||||
|
||||
## Summary
|
||||
A design for a storage system that allows storage of files required to make
|
||||
Neon's Endpoints have a better experience at or after a reboot.
|
||||
|
||||
## Motivation
|
||||
Several systems inside PostgreSQL (and Neon) need some persistent storage for
|
||||
optimal workings across reboots and restarts, but still work without.
|
||||
Examples are the query-level statistics files of `pg_stat_statements` in
|
||||
`pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`.
|
||||
We need a storage system that can store and manage these files for each
|
||||
Endpoint, without necessarily granting users access to an unlimited storage
|
||||
device.
|
||||
|
||||
## Goals
|
||||
- Store known files for Endpoints with reasonable persistence.
|
||||
_Data loss in this service, while annoying and bad for UX, won't lose any
|
||||
customer's data._
|
||||
|
||||
## Non Goals (if relevant)
|
||||
- This storage system does not need branching, file versioning, or other such
|
||||
features. The files are as ephemeral to the timeline of the data as the
|
||||
Endpoints that host the data.
|
||||
- This storage system does not need to store _all_ user files, only 'known'
|
||||
user files.
|
||||
- This storage system does not need to be hosted fully inside Computes.
|
||||
_Instead, this will be a separate component similar to Pageserver,
|
||||
SafeKeeper, the S3 proxy used for dynamically loaded extensions, etc._
|
||||
|
||||
## Impacted components
|
||||
- Compute needs new code to load and store these files in its lifetime.
|
||||
- Control Plane needs to consider this new storage system when signalling
|
||||
the deletion of an Endpoint, Timeline, or Tenant.
|
||||
- Control Plane needs to consider this new storage system when it resets
|
||||
or re-assigns an endpoint's timeline/branch state.
|
||||
|
||||
A new service is created: the Endpoint Persistent Unlogged Files Storage
|
||||
service. This could be integrated in e.g. Pageserver or Control Plane, or a
|
||||
separately hosted service.
|
||||
|
||||
## Proposed implementation
|
||||
Endpoint-related data files are managed by a newly designed service (which
|
||||
optionally is integrated in an existing service like Pageserver or Control
|
||||
Plane), which stores data directly into S3 or any blob storage of choice.
|
||||
|
||||
Upon deletion of the Endpoint, or reassignment of the endpoint to a different
|
||||
branch, this ephemeral data is dropped: the data stored may not match the
|
||||
state of the branch's data after reassignment, and on endpoint deletion the
|
||||
data won't have any use to the user.
|
||||
|
||||
Compute gets credentials (JWT token with Tenant, Timeline & Endpoint claims)
|
||||
which it can use to authenticate to this new service and retrieve and store
|
||||
data associated with this endpoint. This limited scope reduces leaks of data
|
||||
across endpoints and timeline resets, and limits the ability of endpoints to
|
||||
mess with other endpoints' data.
|
||||
|
||||
The path of this endpoint data in S3 is initially as follows:
|
||||
|
||||
s3://<regional-epufs-bucket>/
|
||||
tenants/
|
||||
<hex-tenant-id>/
|
||||
tenants/
|
||||
<hex-timeline-id>/
|
||||
endpoints/
|
||||
<endpoint-id>/
|
||||
pgdata/
|
||||
<file_path_in_pgdatadir>
|
||||
|
||||
For other blob storages an equivalent or similar path can be constructed.
|
||||
|
||||
### Reliability, failure modes and corner cases (if relevant)
|
||||
Reliability is important, but not critical to the workings of Neon. The data
|
||||
stored in this service will, when lost, reduce performance, but won't be a
|
||||
cause of permanent data loss - only operational metadata is stored.
|
||||
|
||||
Most, if not all, blob storage services have sufficiently high persistence
|
||||
guarantees to cater our need for persistence and uptime. The only concern with
|
||||
blob storages is that the access latency is generally higher than local disk,
|
||||
but for the object types stored (cache state, ...) I don't think this will be
|
||||
much of an issue.
|
||||
|
||||
### Interaction/Sequence diagram (if relevant)
|
||||
|
||||
In these diagrams you can replace S3 with any persistent storage device of
|
||||
choice, but S3 is chosen as representative name: The well-known and short name
|
||||
of AWS' blob storage. Azure Blob Storage should work too, but it has a much
|
||||
longer name making it less practical for the diagrams.
|
||||
|
||||
Write data:
|
||||
|
||||
```http
|
||||
POST /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
|
||||
Host: epufs.svc.neon.local
|
||||
|
||||
<<<
|
||||
|
||||
200 OK
|
||||
{
|
||||
"version": "<opaque>", # opaque file version token, changes when the file contents change
|
||||
"size": <bytes>,
|
||||
}
|
||||
```
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant co as Compute
|
||||
participant ep as EPUFS
|
||||
participant s3 as Blob Storage
|
||||
|
||||
co-->ep: Connect with credentials
|
||||
co->>+ep: Store Unlogged Persistent File
|
||||
opt is authenticated
|
||||
ep->>s3: Write UPF to S3
|
||||
end
|
||||
ep->>-co: OK / Failure / Auth Failure
|
||||
co-->ep: Cancel connection
|
||||
```
|
||||
|
||||
Read data: (optional with cache-relevant request parameters, e.g. If-Modified-Since)
|
||||
```http
|
||||
GET /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
|
||||
Host: epufs.svc.neon.local
|
||||
|
||||
<<<
|
||||
|
||||
200 OK
|
||||
|
||||
<file data>
|
||||
```
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant co as Compute
|
||||
participant ep as EPUFS
|
||||
participant s3 as Blob Storage
|
||||
|
||||
co->>+ep: Read Unlogged Persistent File
|
||||
opt is authenticated
|
||||
ep->>+s3: Request UPF from storage
|
||||
s3->>-ep: Receive UPF from storage
|
||||
end
|
||||
ep->>-co: OK(response) / Failure(storage, auth, ...)
|
||||
```
|
||||
|
||||
Compute Startup:
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant co as Compute
|
||||
participant ps as Pageserver
|
||||
participant ep as EPUFS
|
||||
participant es as Extension server
|
||||
|
||||
note over co: Bind endpoint ep-xxx
|
||||
par Get basebackup
|
||||
co->>+ps: Request basebackup @ LSN
|
||||
ps-)ps: Construct basebackup
|
||||
ps->>-co: Receive basebackup TAR @ LSN
|
||||
and Get startup-critical Unlogged Persistent Files
|
||||
co->>+ep: Get all UPFs of endpoint ep-xxx
|
||||
ep-)ep: Retrieve and gather all UPFs
|
||||
ep->>-co: TAR of UPFs
|
||||
and Get startup-critical extensions
|
||||
loop For every startup-critical extension
|
||||
co->>es: Get critical extension
|
||||
es->>co: Receive critical extension
|
||||
end
|
||||
end
|
||||
note over co: Start compute
|
||||
```
|
||||
|
||||
CPlane ops:
|
||||
```http
|
||||
DELETE /tenants/<tenant-id>/timelines/<timeline-id>/endpoints/<endpoint-id>
|
||||
Host: epufs.svc.neon.local
|
||||
|
||||
<<<
|
||||
|
||||
200 OK
|
||||
{
|
||||
"tenant": "<tenant-id>",
|
||||
"timeline": "<timeline-id>",
|
||||
"endpoint": "<endpoint-id>",
|
||||
"deleted": {
|
||||
"files": <count>,
|
||||
"bytes": <count>,
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
DELETE /tenants/<tenant-id>/timelines/<timeline-id>
|
||||
Host: epufs.svc.neon.local
|
||||
|
||||
<<<
|
||||
|
||||
200 OK
|
||||
{
|
||||
"tenant": "<tenant-id>",
|
||||
"timeline": "<timeline-id>",
|
||||
"deleted": {
|
||||
"files": <count>,
|
||||
"bytes": <count>,
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
```http
|
||||
DELETE /tenants/<tenant-id>
|
||||
Host: epufs.svc.neon.local
|
||||
|
||||
<<<
|
||||
|
||||
200 OK
|
||||
{
|
||||
"tenant": "<tenant-id>",
|
||||
"deleted": {
|
||||
"files": <count>,
|
||||
"bytes": <count>,
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant cp as Control Plane
|
||||
participant ep as EPUFS
|
||||
participant s3 as Blob Storage
|
||||
|
||||
alt Tenant deleted
|
||||
cp-)ep: Tenant deleted
|
||||
loop For every object associated with removed tenant
|
||||
ep->>s3: Remove data of deleted tenant from Storage
|
||||
end
|
||||
opt
|
||||
ep-)cp: Tenant cleanup complete
|
||||
end
|
||||
alt Timeline deleted
|
||||
cp-)ep: Timeline deleted
|
||||
loop For every object associated with removed timeline
|
||||
ep->>s3: Remove data of deleted timeline from Storage
|
||||
end
|
||||
opt
|
||||
ep-)cp: Timeline cleanup complete
|
||||
end
|
||||
else Endpoint reassigned or removed
|
||||
cp->>+ep: Endpoint reassigned
|
||||
loop For every object associated with reassigned/removed endpoint
|
||||
ep->>s3: Remove data from Storage
|
||||
end
|
||||
ep->>-cp: Cleanup complete
|
||||
end
|
||||
```
|
||||
|
||||
### Scalability (if relevant)
|
||||
|
||||
Provisionally: As this service is going to be part of compute startup, this
|
||||
service should be able to quickly respond to all requests. Therefore this
|
||||
service is deployed to every AZ we host Computes in, and Computes communicate
|
||||
(generally) only to the EPUFS endpoint of the AZ they're hosted in.
|
||||
|
||||
Local caching of frequently restarted endpoints' data or metadata may be
|
||||
needed for best performance. However, due to the regional nature of stored
|
||||
data but zonal nature of the service deployment, we should be careful when we
|
||||
implement any local caching, as it is possible that computes in AZ 1 will
|
||||
update data originally written and thus cached by AZ 2. Cache version tests
|
||||
and invalidation is therefore required if we want to roll out caching to this
|
||||
service, which is too broad a scope for an MVC. This is why caching is left
|
||||
out of scope for this RFC, and should be considered separately after this RFC
|
||||
is implemented.
|
||||
|
||||
### Security implications (if relevant)
|
||||
This service must be able to authenticate users at least by Tenant ID,
|
||||
Timeline ID and Endpoint ID. This will use the existing JWT infrastructure of
|
||||
Compute, which will be upgraded to the extent needed to support Timeline- and
|
||||
Endpoint-based claims.
|
||||
|
||||
The service requires unlimited access to (a prefix of) a blob storage bucket,
|
||||
and thus must be hosted outside the Compute VM sandbox.
|
||||
|
||||
A service that generates pre-signed request URLs for Compute to download the
|
||||
data from that URL is likely problematic, too: Compute would be able to write
|
||||
unlimited data to the bucket, or exfiltrate this signed URL to get read/write
|
||||
access to specific objects in this bucket, which would still effectively give
|
||||
users access to the S3 bucket (but with improved access logging).
|
||||
|
||||
There may be a use case for transferring data associated with one endpoint to
|
||||
another endpoint (e.g. to make one endpoint warm its caches with the state of
|
||||
another endpoint), but that's not currently in scope, and specific needs may
|
||||
be solved through out-of-line communication of data or pre-signed URLs.
|
||||
|
||||
### Unresolved questions (if relevant)
|
||||
Caching of files is not in the implementation scope of the document, but
|
||||
should at some future point be considered to maximize performance.
|
||||
|
||||
## Alternative implementation (if relevant)
|
||||
Several ideas have come up to solve this issue:
|
||||
|
||||
### Use AUXfile
|
||||
One prevalent idea was to WAL-log the files using our AUXfile mechanism.
|
||||
|
||||
Benefits:
|
||||
|
||||
+ We already have this storage mechanism
|
||||
|
||||
Demerits:
|
||||
|
||||
- It isn't available on read replicas
|
||||
- Additional WAL will be consumed during shutdown and after the shutdown
|
||||
checkpoint, which needs PG modifications to work without panics.
|
||||
- It increases the data we need to manage in our versioned storage, thus
|
||||
causing higher storage costs with higher retention due to duplication at
|
||||
the storage layer.
|
||||
|
||||
### Sign URLs for read/write operations, instead of proxying them
|
||||
|
||||
Benefits:
|
||||
|
||||
+ The service can be implemented with a much reduced IO budget
|
||||
|
||||
Demerits:
|
||||
|
||||
- Users could get access to these signed credentials
|
||||
- Not all blob storage services may implement URL signing
|
||||
|
||||
### Give endpoints each their own directly accessed block volume
|
||||
|
||||
Benefits:
|
||||
|
||||
+ Easier to integrate for PostgreSQL
|
||||
|
||||
Demerits:
|
||||
|
||||
- Little control on data size and contents
|
||||
- Potentially problematic as we'd need to store data all across the pgdata
|
||||
directory.
|
||||
- EBS is not a good candidate
|
||||
- Attaches in 10s of seconds, if not more; i.e. too cold to start
|
||||
- Shared EBS volumes are a no-go, as you'd have to schedule the endpoint
|
||||
with users of the same EBS volumes, which can't work with VM migration
|
||||
- EBS storage costs are very high (>80$/kilotenant when using a
|
||||
volume/tenant)
|
||||
- EBS volumes can't be mounted across AZ boundaries
|
||||
- Bucket per endpoint is unfeasible
|
||||
- S3 buckets are priced at $20/month per 1k, which we could better spend
|
||||
on developers.
|
||||
- Allocating service accounts takes time (100s of ms), and service accounts
|
||||
are a limited resource, too; so they're not a good candidate to allocate
|
||||
on a per-endpoint basis.
|
||||
- Giving credentials limited to prefix has similar issues as the pre-signed
|
||||
URL approach.
|
||||
- Bucket DNS lookup will fill DNS caches and put pressure on DNS lookup
|
||||
much more than our current systems would.
|
||||
- Volumes bound by hypervisor are unlikely
|
||||
- This requires significant investment and increased software on the
|
||||
hypervisor.
|
||||
- It is unclear if we can attach volumes after boot, i.e. for pooled
|
||||
instances.
|
||||
|
||||
### Put the files into a table
|
||||
|
||||
Benefits:
|
||||
|
||||
+ Mostly already available in PostgreSQL
|
||||
|
||||
Demerits:
|
||||
|
||||
- Uses WAL
|
||||
- Can't be used after shutdown checkpoint
|
||||
- Needs a RW endpoint, and table & catalog access to write to this data
|
||||
- Gets hit with DB size limitations
|
||||
- Depending on user acces:
|
||||
- Inaccessible:
|
||||
The user doesn't have control over database size caused by
|
||||
these systems.
|
||||
- Accessible:
|
||||
The user can corrupt these files and cause the system to crash while
|
||||
user-corrupted files are present, thus increasing on-call overhead.
|
||||
|
||||
## Definition of Done (if relevant)
|
||||
|
||||
This project is done if we have:
|
||||
|
||||
- One S3 bucket equivalent per region, which stores this per-endpoint data.
|
||||
- A new service endpoint in at least every AZ, which indirectly grants
|
||||
endpoints access to the data stored for these endpoints in these buckets.
|
||||
- Compute writes & reads temp-data at shutdown and startup, respectively, for
|
||||
at least the pg_prewarm or lfc_prewarm state files.
|
||||
- Cleanup of endpoint data is triggered when the endpoint is deleted or is
|
||||
detached from its current timeline.
|
||||
@@ -12,6 +12,7 @@ jsonwebtoken.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
regex.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
utils = { path = "../utils" }
|
||||
remote_storage = { version = "0.1", path = "../remote_storage/" }
|
||||
|
||||
@@ -4,11 +4,14 @@
|
||||
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
|
||||
//! compute_ctl can fetch it by calling the control plane's API.
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
|
||||
use anyhow::anyhow;
|
||||
use indexmap::IndexMap;
|
||||
use regex::Regex;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -181,6 +184,11 @@ pub struct ComputeSpec {
|
||||
/// Download LFC state from endpoint_storage and pass it to Postgres on startup
|
||||
#[serde(default)]
|
||||
pub autoprewarm: bool,
|
||||
|
||||
/// Suspend timeout in seconds.
|
||||
///
|
||||
/// We use this value to derive other values, such as the installed extensions metric.
|
||||
pub suspend_timeout_seconds: i64,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
@@ -429,6 +437,47 @@ pub struct JwksSettings {
|
||||
pub jwt_audience: Option<String>,
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub enum PageserverProtocol {
|
||||
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
|
||||
#[default]
|
||||
Libpq,
|
||||
/// A newer, gRPC-based protocol. Uses grpc:// scheme.
|
||||
Grpc,
|
||||
}
|
||||
|
||||
impl PageserverProtocol {
|
||||
/// Parses the protocol from a connstring scheme. Defaults to Libpq if no scheme is given.
|
||||
/// Errors if the connstring is an invalid URL.
|
||||
pub fn from_connstring(connstring: &str) -> anyhow::Result<Self> {
|
||||
let scheme = match Url::parse(connstring) {
|
||||
Ok(url) => url.scheme().to_lowercase(),
|
||||
Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()),
|
||||
Err(err) => return Err(anyhow!("invalid connstring URL: {err}")),
|
||||
};
|
||||
match scheme.as_str() {
|
||||
"postgresql" | "postgres" => Ok(Self::Libpq),
|
||||
"grpc" => Ok(Self::Grpc),
|
||||
scheme => Err(anyhow!("invalid protocol scheme: {scheme}")),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the URL scheme for the protocol, for use in connstrings.
|
||||
pub fn scheme(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Libpq => "postgresql",
|
||||
Self::Grpc => "grpc",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for PageserverProtocol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.scheme())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fs::File;
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
"timestamp": "2021-05-23T18:25:43.511Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
|
||||
"suspend_timeout_seconds": 3600,
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "test-cluster-42",
|
||||
|
||||
@@ -19,6 +19,7 @@ byteorder.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi_types.workspace = true
|
||||
postgres_versioninfo.workspace = true
|
||||
posthog_client_lite.workspace = true
|
||||
enum-map.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
@@ -29,12 +30,13 @@ humantime-serde.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
itertools.workspace = true
|
||||
storage_broker.workspace = true
|
||||
camino = {workspace = true, features = ["serde1"]}
|
||||
camino = { workspace = true, features = ["serde1"] }
|
||||
remote_storage.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
nix = {workspace = true, optional = true}
|
||||
nix = { workspace = true, optional = true }
|
||||
reqwest.workspace = true
|
||||
rand.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ use camino::Utf8PathBuf;
|
||||
mod tests;
|
||||
|
||||
use const_format::formatcp;
|
||||
use posthog_client_lite::PostHogClientConfig;
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
@@ -63,25 +64,66 @@ impl Display for NodeMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// PostHog integration config.
|
||||
/// PostHog integration config. This is used in pageserver, storcon, and neon_local.
|
||||
/// Ensure backward compatibility when adding new fields.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct PostHogConfig {
|
||||
/// PostHog project ID
|
||||
pub project_id: String,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub project_id: Option<String>,
|
||||
/// Server-side (private) API key
|
||||
pub server_api_key: String,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub server_api_key: Option<String>,
|
||||
/// Client-side (public) API key
|
||||
pub client_api_key: String,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub client_api_key: Option<String>,
|
||||
/// Private API URL
|
||||
pub private_api_url: String,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub private_api_url: Option<String>,
|
||||
/// Public API URL
|
||||
pub public_api_url: String,
|
||||
/// Refresh interval for the feature flag spec
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub public_api_url: Option<String>,
|
||||
/// Refresh interval for the feature flag spec.
|
||||
/// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive
|
||||
/// the spec for `refresh_interval`, it will fetch the spec from the PostHog API.
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub refresh_interval: Option<Duration>,
|
||||
}
|
||||
|
||||
impl PostHogConfig {
|
||||
pub fn try_into_posthog_config(self) -> Result<PostHogClientConfig, &'static str> {
|
||||
let Some(project_id) = self.project_id else {
|
||||
return Err("project_id is required");
|
||||
};
|
||||
let Some(server_api_key) = self.server_api_key else {
|
||||
return Err("server_api_key is required");
|
||||
};
|
||||
let Some(client_api_key) = self.client_api_key else {
|
||||
return Err("client_api_key is required");
|
||||
};
|
||||
let Some(private_api_url) = self.private_api_url else {
|
||||
return Err("private_api_url is required");
|
||||
};
|
||||
let Some(public_api_url) = self.public_api_url else {
|
||||
return Err("public_api_url is required");
|
||||
};
|
||||
Ok(PostHogClientConfig {
|
||||
project_id,
|
||||
server_api_key,
|
||||
client_api_key,
|
||||
private_api_url,
|
||||
public_api_url,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// `pageserver.toml`
|
||||
///
|
||||
/// We use serde derive with `#[serde(default)]` to generate a deserializer
|
||||
@@ -367,6 +409,9 @@ pub struct BasebackupCacheConfig {
|
||||
// TODO(diko): support max_entry_size_bytes.
|
||||
// pub max_entry_size_bytes: u64,
|
||||
pub max_size_entries: usize,
|
||||
/// Size of the channel used to send prepare requests to the basebackup cache worker.
|
||||
/// If exceeded, new prepare requests will be dropped.
|
||||
pub prepare_channel_size: usize,
|
||||
}
|
||||
|
||||
impl Default for BasebackupCacheConfig {
|
||||
@@ -375,7 +420,8 @@ impl Default for BasebackupCacheConfig {
|
||||
cleanup_period: Duration::from_secs(60),
|
||||
max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB
|
||||
// max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB
|
||||
max_size_entries: 1000,
|
||||
max_size_entries: 10000,
|
||||
prepare_channel_size: 100,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -546,6 +546,11 @@ pub struct TimelineImportRequest {
|
||||
pub sk_set: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize, serde::Deserialize, Clone)]
|
||||
pub struct TimelineSafekeeperMigrateRequest {
|
||||
pub new_sk_set: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use serde_json;
|
||||
|
||||
@@ -21,7 +21,9 @@ use utils::{completion, serde_system_time};
|
||||
|
||||
use crate::config::Ratio;
|
||||
use crate::key::{CompactKey, Key};
|
||||
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
use crate::shard::{
|
||||
DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
///
|
||||
@@ -475,7 +477,7 @@ pub struct TenantShardSplitResponse {
|
||||
}
|
||||
|
||||
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct ShardParameters {
|
||||
pub count: ShardCount,
|
||||
@@ -497,6 +499,15 @@ impl Default for ShardParameters {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ShardIdentity> for ShardParameters {
|
||||
fn from(identity: ShardIdentity) -> Self {
|
||||
Self {
|
||||
count: identity.count,
|
||||
stripe_size: identity.stripe_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Eq, PartialEq)]
|
||||
pub enum FieldPatch<T> {
|
||||
Upsert(T),
|
||||
|
||||
@@ -37,6 +37,7 @@ use std::hash::{Hash, Hasher};
|
||||
pub use ::utils::shard::*;
|
||||
use postgres_ffi_types::forknum::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::critical;
|
||||
|
||||
use crate::key::Key;
|
||||
use crate::models::ShardParameters;
|
||||
@@ -179,7 +180,7 @@ impl ShardIdentity {
|
||||
|
||||
/// For use when creating ShardIdentity instances for new shards, where a creation request
|
||||
/// specifies the ShardParameters that apply to all shards.
|
||||
pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
|
||||
pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self {
|
||||
Self {
|
||||
number,
|
||||
count: params.count,
|
||||
@@ -188,6 +189,17 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Asserts that the given shard identities are equal. Changes to shard parameters will likely
|
||||
/// result in data corruption.
|
||||
pub fn assert_equal(&self, other: ShardIdentity) {
|
||||
if self != &other {
|
||||
// TODO: for now, we're conservative and just log errors in production. Turn this into a
|
||||
// real assertion when we're confident it doesn't misfire, and also reject requests that
|
||||
// attempt to change it with an error response.
|
||||
critical!("shard identity mismatch: {self:?} != {other:?}");
|
||||
}
|
||||
}
|
||||
|
||||
fn is_broken(&self) -> bool {
|
||||
self.layout == LAYOUT_BROKEN
|
||||
}
|
||||
|
||||
@@ -1,17 +1,22 @@
|
||||
//! A background loop that fetches feature flags from PostHog and updates the feature store.
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, info_span};
|
||||
|
||||
use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};
|
||||
use crate::{
|
||||
CaptureEvent, FeatureStore, LocalEvaluationResponse, PostHogClient, PostHogClientConfig,
|
||||
};
|
||||
|
||||
/// A background loop that fetches feature flags from PostHog and updates the feature store.
|
||||
pub struct FeatureResolverBackgroundLoop {
|
||||
posthog_client: PostHogClient,
|
||||
feature_store: ArcSwap<FeatureStore>,
|
||||
feature_store: ArcSwap<(SystemTime, Arc<FeatureStore>)>,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
@@ -19,11 +24,35 @@ impl FeatureResolverBackgroundLoop {
|
||||
pub fn new(config: PostHogClientConfig, shutdown_pageserver: CancellationToken) -> Self {
|
||||
Self {
|
||||
posthog_client: PostHogClient::new(config),
|
||||
feature_store: ArcSwap::new(Arc::new(FeatureStore::new())),
|
||||
feature_store: ArcSwap::new(Arc::new((
|
||||
SystemTime::UNIX_EPOCH,
|
||||
Arc::new(FeatureStore::new()),
|
||||
))),
|
||||
cancel: shutdown_pageserver,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the feature store with a new feature flag spec bypassing the normal refresh loop.
|
||||
pub fn update(&self, spec: String) -> anyhow::Result<()> {
|
||||
let resp: LocalEvaluationResponse = serde_json::from_str(&spec)?;
|
||||
self.update_feature_store_nofail(resp, "http_propagate");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_feature_store_nofail(&self, resp: LocalEvaluationResponse, source: &'static str) {
|
||||
let project_id = self.posthog_client.config.project_id.parse::<u64>().ok();
|
||||
match FeatureStore::new_with_flags(resp.flags, project_id) {
|
||||
Ok(feature_store) => {
|
||||
self.feature_store
|
||||
.store(Arc::new((SystemTime::now(), Arc::new(feature_store))));
|
||||
tracing::info!("Feature flag updated from {}", source);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Cannot process feature flag spec from {}: {}", source, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn(
|
||||
self: Arc<Self>,
|
||||
handle: &tokio::runtime::Handle,
|
||||
@@ -47,6 +76,17 @@ impl FeatureResolverBackgroundLoop {
|
||||
_ = ticker.tick() => {}
|
||||
_ = cancel.cancelled() => break
|
||||
}
|
||||
{
|
||||
let last_update = this.feature_store.load().0;
|
||||
if let Ok(elapsed) = last_update.elapsed() {
|
||||
if elapsed < refresh_period {
|
||||
tracing::debug!(
|
||||
"Skipping feature flag refresh because it's too soon"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
let resp = match this
|
||||
.posthog_client
|
||||
.get_feature_flags_local_evaluation()
|
||||
@@ -58,16 +98,7 @@ impl FeatureResolverBackgroundLoop {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let project_id = this.posthog_client.config.project_id.parse::<u64>().ok();
|
||||
match FeatureStore::new_with_flags(resp.flags, project_id) {
|
||||
Ok(feature_store) => {
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
tracing::info!("Feature flag updated");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Cannot process feature flag spec: {}", e);
|
||||
}
|
||||
}
|
||||
this.update_feature_store_nofail(resp, "refresh_loop");
|
||||
}
|
||||
tracing::info!("PostHog feature resolver stopped");
|
||||
}
|
||||
@@ -92,6 +123,6 @@ impl FeatureResolverBackgroundLoop {
|
||||
}
|
||||
|
||||
pub fn feature_store(&self) -> Arc<FeatureStore> {
|
||||
self.feature_store.load_full()
|
||||
self.feature_store.load().1.clone()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -544,17 +544,8 @@ impl PostHogClient {
|
||||
self.config.server_api_key.starts_with("phs_")
|
||||
}
|
||||
|
||||
/// Fetch the feature flag specs from the server.
|
||||
///
|
||||
/// This is unfortunately an undocumented API at:
|
||||
/// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
|
||||
/// - <https://posthog.com/docs/feature-flags/local-evaluation>
|
||||
///
|
||||
/// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
|
||||
/// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
|
||||
pub async fn get_feature_flags_local_evaluation(
|
||||
&self,
|
||||
) -> anyhow::Result<LocalEvaluationResponse> {
|
||||
/// Get the raw JSON spec, same as `get_feature_flags_local_evaluation` but without parsing.
|
||||
pub async fn get_feature_flags_local_evaluation_raw(&self) -> anyhow::Result<String> {
|
||||
// BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
|
||||
// with bearer token of self.server_api_key
|
||||
// OR
|
||||
@@ -588,7 +579,22 @@ impl PostHogClient {
|
||||
body
|
||||
));
|
||||
}
|
||||
Ok(serde_json::from_str(&body)?)
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
/// Fetch the feature flag specs from the server.
|
||||
///
|
||||
/// This is unfortunately an undocumented API at:
|
||||
/// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
|
||||
/// - <https://posthog.com/docs/feature-flags/local-evaluation>
|
||||
///
|
||||
/// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
|
||||
/// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
|
||||
pub async fn get_feature_flags_local_evaluation(
|
||||
&self,
|
||||
) -> Result<LocalEvaluationResponse, anyhow::Error> {
|
||||
let raw = self.get_feature_flags_local_evaluation_raw().await?;
|
||||
Ok(serde_json::from_str(&raw)?)
|
||||
}
|
||||
|
||||
/// Capture an event. This will only be used to report the feature flag usage back to PostHog, though
|
||||
|
||||
@@ -12,7 +12,9 @@ use tokio::net::TcpStream;
|
||||
|
||||
use crate::connect::connect;
|
||||
use crate::connect_raw::{RawConnection, connect_raw};
|
||||
use crate::tls::{MakeTlsConnect, TlsConnect};
|
||||
use crate::connect_tls::connect_tls;
|
||||
use crate::maybe_tls_stream::MaybeTlsStream;
|
||||
use crate::tls::{MakeTlsConnect, TlsConnect, TlsStream};
|
||||
use crate::{Client, Connection, Error};
|
||||
|
||||
/// TLS configuration.
|
||||
@@ -238,7 +240,7 @@ impl Config {
|
||||
connect(tls, self).await
|
||||
}
|
||||
|
||||
pub async fn connect_raw<S, T>(
|
||||
pub async fn tls_and_authenticate<S, T>(
|
||||
&self,
|
||||
stream: S,
|
||||
tls: T,
|
||||
@@ -247,7 +249,19 @@ impl Config {
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsConnect<S>,
|
||||
{
|
||||
connect_raw(stream, tls, self).await
|
||||
let stream = connect_tls(stream, self.ssl_mode, tls).await?;
|
||||
connect_raw(stream, self).await
|
||||
}
|
||||
|
||||
pub async fn authenticate<S, T>(
|
||||
&self,
|
||||
stream: MaybeTlsStream<S, T>,
|
||||
) -> Result<RawConnection<S, T>, Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsStream + Unpin,
|
||||
{
|
||||
connect_raw(stream, self).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ use crate::codec::BackendMessage;
|
||||
use crate::config::Host;
|
||||
use crate::connect_raw::connect_raw;
|
||||
use crate::connect_socket::connect_socket;
|
||||
use crate::connect_tls::connect_tls;
|
||||
use crate::tls::{MakeTlsConnect, TlsConnect};
|
||||
use crate::{Client, Config, Connection, Error, RawConnection};
|
||||
|
||||
@@ -44,13 +45,14 @@ where
|
||||
T: TlsConnect<TcpStream>,
|
||||
{
|
||||
let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
|
||||
let stream = connect_tls(socket, config.ssl_mode, tls).await?;
|
||||
let RawConnection {
|
||||
stream,
|
||||
parameters,
|
||||
delayed_notice,
|
||||
process_id,
|
||||
secret_key,
|
||||
} = connect_raw(socket, tls, config).await?;
|
||||
} = connect_raw(stream, config).await?;
|
||||
|
||||
let socket_config = SocketConfig {
|
||||
host_addr,
|
||||
|
||||
@@ -16,9 +16,8 @@ use tokio_util::codec::Framed;
|
||||
use crate::Error;
|
||||
use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
|
||||
use crate::config::{self, AuthKeys, Config};
|
||||
use crate::connect_tls::connect_tls;
|
||||
use crate::maybe_tls_stream::MaybeTlsStream;
|
||||
use crate::tls::{TlsConnect, TlsStream};
|
||||
use crate::tls::TlsStream;
|
||||
|
||||
pub struct StartupStream<S, T> {
|
||||
inner: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
|
||||
@@ -87,16 +86,13 @@ pub struct RawConnection<S, T> {
|
||||
}
|
||||
|
||||
pub async fn connect_raw<S, T>(
|
||||
stream: S,
|
||||
tls: T,
|
||||
stream: MaybeTlsStream<S, T>,
|
||||
config: &Config,
|
||||
) -> Result<RawConnection<S, T::Stream>, Error>
|
||||
) -> Result<RawConnection<S, T>, Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsConnect<S>,
|
||||
T: TlsStream + Unpin,
|
||||
{
|
||||
let stream = connect_tls(stream, config.ssl_mode, tls).await?;
|
||||
|
||||
let mut stream = StartupStream {
|
||||
inner: Framed::new(stream, PostgresCodec),
|
||||
buf: BackendMessages::empty(),
|
||||
|
||||
@@ -210,7 +210,7 @@ pub struct TimelineStatus {
|
||||
}
|
||||
|
||||
/// Request to switch membership configuration.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TimelineMembershipSwitchRequest {
|
||||
pub mconf: Configuration,
|
||||
@@ -221,6 +221,8 @@ pub struct TimelineMembershipSwitchRequest {
|
||||
pub struct TimelineMembershipSwitchResponse {
|
||||
pub previous_conf: Configuration,
|
||||
pub current_conf: Configuration,
|
||||
pub term: Term,
|
||||
pub flush_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Serialize, Deserialize)]
|
||||
|
||||
@@ -86,6 +86,14 @@ pub enum GateError {
|
||||
GateClosed,
|
||||
}
|
||||
|
||||
impl GateError {
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
GateError::GateClosed => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Gate {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
|
||||
@@ -844,4 +844,13 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> {
|
||||
let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint);
|
||||
self.request(Method::POST, uri, spec)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,12 +9,14 @@ anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
futures.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
postgres_ffi_types.workspace = true
|
||||
prost.workspace = true
|
||||
prost-types.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tonic.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -35,6 +35,8 @@
|
||||
syntax = "proto3";
|
||||
package page_api;
|
||||
|
||||
import "google/protobuf/timestamp.proto";
|
||||
|
||||
service PageService {
|
||||
// Returns whether a relation exists.
|
||||
rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
|
||||
@@ -64,6 +66,10 @@ service PageService {
|
||||
|
||||
// Fetches an SLRU segment.
|
||||
rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
|
||||
|
||||
// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
|
||||
// collect the LSN until the lease expires. Must be acquired on all relevant shards.
|
||||
rpc LeaseLsn (LeaseLsnRequest) returns (LeaseLsnResponse);
|
||||
}
|
||||
|
||||
// The LSN a request should read at.
|
||||
@@ -110,6 +116,19 @@ message GetBaseBackupRequest {
|
||||
bool replica = 2;
|
||||
// If true, include relation files in the base backup. Mainly for debugging and tests.
|
||||
bool full = 3;
|
||||
// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
|
||||
// compression, so that we can cache compressed backups on the server.
|
||||
BaseBackupCompression compression = 4;
|
||||
}
|
||||
|
||||
// Base backup compression algorithms.
|
||||
enum BaseBackupCompression {
|
||||
// Unknown algorithm. Used when clients send an unsupported algorithm.
|
||||
BASE_BACKUP_COMPRESSION_UNKNOWN = 0;
|
||||
// No compression.
|
||||
BASE_BACKUP_COMPRESSION_NONE = 1;
|
||||
// GZIP compression.
|
||||
BASE_BACKUP_COMPRESSION_GZIP = 2;
|
||||
}
|
||||
|
||||
// Base backup response chunk, returned as an ordered stream.
|
||||
@@ -239,3 +258,17 @@ message GetSlruSegmentRequest {
|
||||
message GetSlruSegmentResponse {
|
||||
bytes segment = 1;
|
||||
}
|
||||
|
||||
// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
|
||||
// collect the LSN until the lease expires. Must be acquired on all relevant shards.
|
||||
message LeaseLsnRequest {
|
||||
// The LSN to lease. Can't be 0 or below the current GC cutoff.
|
||||
uint64 lsn = 1;
|
||||
}
|
||||
|
||||
// Lease acquisition response. If the lease could not be granted because the LSN has already been
|
||||
// garbage collected, a FailedPrecondition status will be returned instead.
|
||||
message LeaseLsnResponse {
|
||||
// The lease expiration time.
|
||||
google.protobuf.Timestamp expires = 1;
|
||||
}
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use std::convert::TryInto;
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::TryStreamExt;
|
||||
use futures::{Stream, StreamExt};
|
||||
use anyhow::Result;
|
||||
use futures::{Stream, StreamExt as _, TryStreamExt as _};
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tonic::metadata::AsciiMetadataValue;
|
||||
use tonic::metadata::errors::InvalidMetadataValue;
|
||||
use tonic::transport::Channel;
|
||||
@@ -12,8 +11,6 @@ use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::model;
|
||||
use crate::proto;
|
||||
|
||||
@@ -69,6 +66,7 @@ impl tonic::service::Interceptor for AuthInterceptor {
|
||||
Ok(req)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Client {
|
||||
client: proto::PageServiceClient<
|
||||
@@ -95,7 +93,6 @@ impl Client {
|
||||
|
||||
if let Some(compression) = compression {
|
||||
// TODO: benchmark this (including network latency).
|
||||
// TODO: consider enabling compression by default.
|
||||
client = client
|
||||
.accept_compressed(compression)
|
||||
.send_compressed(compression);
|
||||
@@ -121,22 +118,15 @@ impl Client {
|
||||
pub async fn get_base_backup(
|
||||
&mut self,
|
||||
req: model::GetBaseBackupRequest,
|
||||
) -> Result<impl Stream<Item = Result<Bytes, tonic::Status>> + 'static, tonic::Status> {
|
||||
let proto_req = proto::GetBaseBackupRequest::from(req);
|
||||
|
||||
let response_stream: Streaming<proto::GetBaseBackupResponseChunk> =
|
||||
self.client.get_base_backup(proto_req).await?.into_inner();
|
||||
|
||||
// TODO: Consider dechunking internally
|
||||
let domain_stream = response_stream.map(|chunk_res| {
|
||||
chunk_res.and_then(|proto_chunk| {
|
||||
proto_chunk.try_into().map_err(|e| {
|
||||
tonic::Status::internal(format!("Failed to convert response chunk: {e}"))
|
||||
})
|
||||
})
|
||||
});
|
||||
|
||||
Ok(domain_stream)
|
||||
) -> Result<impl AsyncRead + use<>, tonic::Status> {
|
||||
let req = proto::GetBaseBackupRequest::from(req);
|
||||
let chunks = self.client.get_base_backup(req).await?.into_inner();
|
||||
let reader = StreamReader::new(
|
||||
chunks
|
||||
.map_ok(|resp| resp.chunk)
|
||||
.map_err(std::io::Error::other),
|
||||
);
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
@@ -197,4 +187,17 @@ impl Client {
|
||||
let response = self.client.get_slru_segment(proto_req).await?;
|
||||
Ok(response.into_inner().try_into()?)
|
||||
}
|
||||
|
||||
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
|
||||
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
|
||||
///
|
||||
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
|
||||
/// acquired because the LSN has already been garbage collected.
|
||||
pub async fn lease_lsn(
|
||||
&mut self,
|
||||
req: model::LeaseLsnRequest,
|
||||
) -> Result<model::LeaseLsnResponse, tonic::Status> {
|
||||
let req = proto::LeaseLsnRequest::from(req);
|
||||
Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,10 +16,11 @@
|
||||
//! stream combinators without dealing with errors, and avoids validating the same message twice.
|
||||
|
||||
use std::fmt::Display;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::Oid;
|
||||
// TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid
|
||||
use postgres_ffi_types::Oid;
|
||||
// TODO: split out Lsn, RelTag, SlruKind and other basic types to a separate crate, to avoid
|
||||
// pulling in all of their other crate dependencies when building the client.
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -191,15 +192,21 @@ pub struct GetBaseBackupRequest {
|
||||
pub replica: bool,
|
||||
/// If true, include relation files in the base backup. Mainly for debugging and tests.
|
||||
pub full: bool,
|
||||
/// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
|
||||
/// compression, so that we can cache compressed backups on the server.
|
||||
pub compression: BaseBackupCompression,
|
||||
}
|
||||
|
||||
impl From<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
fn from(pb: proto::GetBaseBackupRequest) -> Self {
|
||||
Self {
|
||||
impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
|
||||
replica: pb.replica,
|
||||
full: pb.full,
|
||||
}
|
||||
compression: pb.compression.try_into()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,10 +216,55 @@ impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
|
||||
lsn: request.lsn.unwrap_or_default().0,
|
||||
replica: request.replica,
|
||||
full: request.full,
|
||||
compression: request.compression.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Base backup compression algorithm.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum BaseBackupCompression {
|
||||
None,
|
||||
Gzip,
|
||||
}
|
||||
|
||||
impl TryFrom<proto::BaseBackupCompression> for BaseBackupCompression {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::BaseBackupCompression) -> Result<Self, Self::Error> {
|
||||
match pb {
|
||||
proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)),
|
||||
proto::BaseBackupCompression::None => Ok(Self::None),
|
||||
proto::BaseBackupCompression::Gzip => Ok(Self::Gzip),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<i32> for BaseBackupCompression {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(compression: i32) -> Result<Self, Self::Error> {
|
||||
proto::BaseBackupCompression::try_from(compression)
|
||||
.map_err(|_| ProtocolError::invalid("compression", compression))
|
||||
.and_then(Self::try_from)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BaseBackupCompression> for proto::BaseBackupCompression {
|
||||
fn from(compression: BaseBackupCompression) -> Self {
|
||||
match compression {
|
||||
BaseBackupCompression::None => Self::None,
|
||||
BaseBackupCompression::Gzip => Self::Gzip,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BaseBackupCompression> for i32 {
|
||||
fn from(compression: BaseBackupCompression) -> Self {
|
||||
proto::BaseBackupCompression::from(compression).into()
|
||||
}
|
||||
}
|
||||
|
||||
pub type GetBaseBackupResponseChunk = Bytes;
|
||||
|
||||
impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
|
||||
@@ -652,3 +704,54 @@ impl From<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {
|
||||
|
||||
// SlruKind is defined in pageserver_api::reltag.
|
||||
pub type SlruKind = pageserver_api::reltag::SlruKind;
|
||||
|
||||
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
|
||||
/// collect the LSN until the lease expires.
|
||||
pub struct LeaseLsnRequest {
|
||||
/// The LSN to lease.
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl TryFrom<proto::LeaseLsnRequest> for LeaseLsnRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::LeaseLsnRequest) -> Result<Self, Self::Error> {
|
||||
if pb.lsn == 0 {
|
||||
return Err(ProtocolError::Missing("lsn"));
|
||||
}
|
||||
Ok(Self { lsn: Lsn(pb.lsn) })
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LeaseLsnRequest> for proto::LeaseLsnRequest {
|
||||
fn from(request: LeaseLsnRequest) -> Self {
|
||||
Self { lsn: request.lsn.0 }
|
||||
}
|
||||
}
|
||||
|
||||
/// Lease expiration time. If the lease could not be granted because the LSN has already been
|
||||
/// garbage collected, a FailedPrecondition status will be returned instead.
|
||||
pub type LeaseLsnResponse = SystemTime;
|
||||
|
||||
impl TryFrom<proto::LeaseLsnResponse> for LeaseLsnResponse {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::LeaseLsnResponse) -> Result<Self, Self::Error> {
|
||||
let expires = pb.expires.ok_or(ProtocolError::Missing("expires"))?;
|
||||
UNIX_EPOCH
|
||||
.checked_add(Duration::new(expires.seconds as u64, expires.nanos as u32))
|
||||
.ok_or_else(|| ProtocolError::invalid("expires", expires))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LeaseLsnResponse> for proto::LeaseLsnResponse {
|
||||
fn from(response: LeaseLsnResponse) -> Self {
|
||||
let expires = response.duration_since(UNIX_EPOCH).unwrap_or_default();
|
||||
Self {
|
||||
expires: Some(prost_types::Timestamp {
|
||||
seconds: expires.as_secs() as i64,
|
||||
nanos: expires.subsec_nanos() as i32,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -317,6 +317,7 @@ impl Client for LibpqClient {
|
||||
/// A gRPC Pageserver client.
|
||||
struct GrpcClient {
|
||||
inner: page_api::Client,
|
||||
compression: page_api::BaseBackupCompression,
|
||||
}
|
||||
|
||||
impl GrpcClient {
|
||||
@@ -331,10 +332,14 @@ impl GrpcClient {
|
||||
ttid.timeline_id,
|
||||
ShardIndex::unsharded(),
|
||||
None,
|
||||
compression.then_some(tonic::codec::CompressionEncoding::Zstd),
|
||||
None, // NB: uses payload compression
|
||||
)
|
||||
.await?;
|
||||
Ok(Self { inner })
|
||||
let compression = match compression {
|
||||
true => page_api::BaseBackupCompression::Gzip,
|
||||
false => page_api::BaseBackupCompression::None,
|
||||
};
|
||||
Ok(Self { inner, compression })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -348,10 +353,8 @@ impl Client for GrpcClient {
|
||||
lsn,
|
||||
replica: false,
|
||||
full: false,
|
||||
compression: self.compression,
|
||||
};
|
||||
let stream = self.inner.get_base_backup(req).await?;
|
||||
Ok(Box::pin(StreamReader::new(
|
||||
stream.map_err(std::io::Error::other),
|
||||
)))
|
||||
Ok(Box::pin(self.inner.get_base_backup(req).await?))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ use std::fmt::Write as FmtWrite;
|
||||
use std::time::{Instant, SystemTime};
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{Key, rel_block_to_key};
|
||||
@@ -25,8 +26,7 @@ use postgres_ffi::{
|
||||
};
|
||||
use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM};
|
||||
use tokio::io;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tokio::io::{self, AsyncWrite, AsyncWriteExt as _};
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -97,6 +97,7 @@ impl From<BasebackupError> for tonic::Status {
|
||||
/// * When working without safekeepers. In this situation it is important to match the lsn
|
||||
/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
||||
/// to start the replication.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn send_basebackup_tarball<'a, W>(
|
||||
write: &'a mut W,
|
||||
timeline: &'a Timeline,
|
||||
@@ -104,6 +105,7 @@ pub async fn send_basebackup_tarball<'a, W>(
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
replica: bool,
|
||||
gzip_level: Option<async_compression::Level>,
|
||||
ctx: &'a RequestContext,
|
||||
) -> Result<(), BasebackupError>
|
||||
where
|
||||
@@ -122,7 +124,7 @@ where
|
||||
// prev_lsn value; that happens if the timeline was just branched from
|
||||
// an old LSN and it doesn't have any WAL of its own yet. We will set
|
||||
// prev_lsn to Lsn(0) if we cannot provide the correct value.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn {
|
||||
// Backup was requested at a particular LSN. The caller should've
|
||||
// already checked that it's a valid LSN.
|
||||
|
||||
@@ -143,7 +145,7 @@ where
|
||||
};
|
||||
|
||||
// Consolidate the derived and the provided prev_lsn values
|
||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
|
||||
return Err(BasebackupError::Server(anyhow!(
|
||||
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
|
||||
@@ -155,30 +157,55 @@ where
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
|
||||
backup_lsn, prev_lsn, full_backup, replica
|
||||
"taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \
|
||||
(full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})",
|
||||
);
|
||||
let span = info_span!("send_tarball", backup_lsn=%lsn);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
timeline.conf.get_vectored_concurrent_io,
|
||||
timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| BasebackupError::Shutdown)?,
|
||||
);
|
||||
|
||||
let basebackup = Basebackup {
|
||||
ar: Builder::new_non_terminated(write),
|
||||
timeline,
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
io_concurrency: IoConcurrency::spawn_from_conf(
|
||||
timeline.conf.get_vectored_concurrent_io,
|
||||
timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| BasebackupError::Shutdown)?,
|
||||
),
|
||||
};
|
||||
basebackup
|
||||
if let Some(gzip_level) = gzip_level {
|
||||
let mut encoder = GzipEncoder::with_quality(write, gzip_level);
|
||||
Basebackup {
|
||||
ar: Builder::new_non_terminated(&mut encoder),
|
||||
timeline,
|
||||
lsn,
|
||||
prev_record_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
io_concurrency,
|
||||
}
|
||||
.send_tarball()
|
||||
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
|
||||
.await
|
||||
.instrument(span)
|
||||
.await?;
|
||||
encoder
|
||||
.shutdown()
|
||||
.await
|
||||
.map_err(|err| BasebackupError::Client(err, "gzip"))?;
|
||||
} else {
|
||||
Basebackup {
|
||||
ar: Builder::new_non_terminated(write),
|
||||
timeline,
|
||||
lsn,
|
||||
prev_record_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
io_concurrency,
|
||||
}
|
||||
.send_tarball()
|
||||
.instrument(span)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
|
||||
use tokio::{
|
||||
io::{AsyncWriteExt, BufWriter},
|
||||
sync::mpsc::{UnboundedReceiver, UnboundedSender},
|
||||
sync::mpsc::{Receiver, Sender, error::TrySendError},
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
@@ -20,8 +19,8 @@ use crate::{
|
||||
basebackup::send_basebackup_tarball,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
metrics::{
|
||||
BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ,
|
||||
BASEBACKUP_CACHE_SIZE,
|
||||
BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE,
|
||||
BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE,
|
||||
},
|
||||
task_mgr::TaskKind,
|
||||
tenant::{
|
||||
@@ -36,8 +35,8 @@ pub struct BasebackupPrepareRequest {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
|
||||
pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
|
||||
pub type BasebackupPrepareSender = Sender<BasebackupPrepareRequest>;
|
||||
pub type BasebackupPrepareReceiver = Receiver<BasebackupPrepareRequest>;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct CacheEntry {
|
||||
@@ -61,40 +60,65 @@ struct CacheEntry {
|
||||
/// and ~1 RPS for get requests.
|
||||
pub struct BasebackupCache {
|
||||
data_dir: Utf8PathBuf,
|
||||
config: Option<BasebackupCacheConfig>,
|
||||
|
||||
entries: std::sync::Mutex<HashMap<TenantTimelineId, CacheEntry>>,
|
||||
|
||||
prepare_sender: BasebackupPrepareSender,
|
||||
|
||||
read_hit_count: GenericCounter<AtomicU64>,
|
||||
read_miss_count: GenericCounter<AtomicU64>,
|
||||
read_err_count: GenericCounter<AtomicU64>,
|
||||
|
||||
prepare_skip_count: GenericCounter<AtomicU64>,
|
||||
}
|
||||
|
||||
impl BasebackupCache {
|
||||
/// Creates a BasebackupCache and spawns the background task.
|
||||
/// The initialization of the cache is performed in the background and does not
|
||||
/// block the caller. The cache will return `None` for any get requests until
|
||||
/// initialization is complete.
|
||||
pub fn spawn(
|
||||
runtime_handle: &tokio::runtime::Handle,
|
||||
/// Create a new BasebackupCache instance.
|
||||
/// Also returns a BasebackupPrepareReceiver which is needed to start
|
||||
/// the background task.
|
||||
/// The cache is initialized from the data_dir in the background task.
|
||||
/// The cache will return `None` for any get requests until the initialization is complete.
|
||||
/// The background task is spawned separately using [`Self::spawn_background_task`]
|
||||
/// to avoid a circular dependency between the cache and the tenant manager.
|
||||
pub fn new(
|
||||
data_dir: Utf8PathBuf,
|
||||
config: Option<BasebackupCacheConfig>,
|
||||
prepare_receiver: BasebackupPrepareReceiver,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
) -> (Arc<Self>, BasebackupPrepareReceiver) {
|
||||
let chan_size = config.as_ref().map(|c| c.max_size_entries).unwrap_or(1);
|
||||
|
||||
let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size);
|
||||
|
||||
let cache = Arc::new(BasebackupCache {
|
||||
data_dir,
|
||||
|
||||
config,
|
||||
entries: std::sync::Mutex::new(HashMap::new()),
|
||||
prepare_sender,
|
||||
|
||||
read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
|
||||
read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
|
||||
read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
|
||||
|
||||
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
|
||||
});
|
||||
|
||||
if let Some(config) = config {
|
||||
(cache, prepare_receiver)
|
||||
}
|
||||
|
||||
/// Spawns the background task.
|
||||
/// The background task initializes the cache from the disk,
|
||||
/// processes prepare requests, and cleans up outdated cache entries.
|
||||
/// Noop if the cache is disabled (config is None).
|
||||
pub fn spawn_background_task(
|
||||
self: Arc<Self>,
|
||||
runtime_handle: &tokio::runtime::Handle,
|
||||
prepare_receiver: BasebackupPrepareReceiver,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
if let Some(config) = self.config.clone() {
|
||||
let background = BackgroundTask {
|
||||
c: cache.clone(),
|
||||
c: self,
|
||||
|
||||
config,
|
||||
tenant_manager,
|
||||
@@ -109,8 +133,45 @@ impl BasebackupCache {
|
||||
};
|
||||
runtime_handle.spawn(background.run(prepare_receiver));
|
||||
}
|
||||
}
|
||||
|
||||
cache
|
||||
/// Send a basebackup prepare request to the background task.
|
||||
/// The basebackup will be prepared asynchronously, it does not block the caller.
|
||||
/// The request will be skipped if any cache limits are exceeded.
|
||||
pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) {
|
||||
let req = BasebackupPrepareRequest {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
};
|
||||
|
||||
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc();
|
||||
let res = self.prepare_sender.try_send(req);
|
||||
|
||||
if let Err(e) = res {
|
||||
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
|
||||
self.prepare_skip_count.inc();
|
||||
match e {
|
||||
TrySendError::Full(_) => {
|
||||
// Basebackup prepares are pretty rare, normally we should not hit this.
|
||||
tracing::info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
%timeline_id,
|
||||
%lsn,
|
||||
"Basebackup prepare channel is full, skipping the request"
|
||||
);
|
||||
}
|
||||
TrySendError::Closed(_) => {
|
||||
// Normal during shutdown, not critical.
|
||||
tracing::info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
%timeline_id,
|
||||
%lsn,
|
||||
"Basebackup prepare channel is closed, skipping the request"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets a basebackup entry from the cache.
|
||||
@@ -123,6 +184,10 @@ impl BasebackupCache {
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Option<tokio::fs::File> {
|
||||
if !self.is_enabled() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Fast path. Check if the entry exists using the in-memory state.
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) {
|
||||
@@ -150,6 +215,10 @@ impl BasebackupCache {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_enabled(&self) -> bool {
|
||||
self.config.is_some()
|
||||
}
|
||||
|
||||
// Private methods.
|
||||
|
||||
fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
|
||||
@@ -367,6 +436,7 @@ impl BackgroundTask {
|
||||
loop {
|
||||
tokio::select! {
|
||||
Some(req) = prepare_receiver.recv() => {
|
||||
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
|
||||
if let Err(err) = self.prepare_basebackup(
|
||||
req.tenant_shard_id,
|
||||
req.timeline_id,
|
||||
@@ -594,13 +664,6 @@ impl BackgroundTask {
|
||||
let file = tokio::fs::File::create(entry_tmp_path).await?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
&mut writer,
|
||||
// Level::Best because compression is not on the hot path of basebackup requests.
|
||||
// The decompression is almost not affected by the compression level.
|
||||
async_compression::Level::Best,
|
||||
);
|
||||
|
||||
// We may receive a request before the WAL record is applied to the timeline.
|
||||
// Wait for the requested LSN to be applied.
|
||||
timeline
|
||||
@@ -613,17 +676,19 @@ impl BackgroundTask {
|
||||
.await?;
|
||||
|
||||
send_basebackup_tarball(
|
||||
&mut encoder,
|
||||
&mut writer,
|
||||
timeline,
|
||||
Some(req_lsn),
|
||||
None,
|
||||
false,
|
||||
false,
|
||||
// Level::Best because compression is not on the hot path of basebackup requests.
|
||||
// The decompression is almost not affected by the compression level.
|
||||
Some(async_compression::Level::Best),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
encoder.shutdown().await?;
|
||||
writer.flush().await?;
|
||||
writer.into_inner().sync_all().await?;
|
||||
|
||||
|
||||
@@ -569,8 +569,10 @@ fn start_pageserver(
|
||||
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let (basebackup_prepare_sender, basebackup_prepare_receiver) =
|
||||
tokio::sync::mpsc::unbounded_channel();
|
||||
let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new(
|
||||
conf.basebackup_cache_dir(),
|
||||
conf.basebackup_cache_config.clone(),
|
||||
);
|
||||
let deletion_queue_client = deletion_queue.new_client();
|
||||
let background_purges = mgr::BackgroundPurges::default();
|
||||
|
||||
@@ -582,7 +584,7 @@ fn start_pageserver(
|
||||
remote_storage: remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
basebackup_cache: Arc::clone(&basebackup_cache),
|
||||
feature_resolver: feature_resolver.clone(),
|
||||
},
|
||||
shutdown_pageserver.clone(),
|
||||
@@ -590,10 +592,8 @@ fn start_pageserver(
|
||||
let tenant_manager = Arc::new(tenant_manager);
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?;
|
||||
|
||||
let basebackup_cache = BasebackupCache::spawn(
|
||||
basebackup_cache.spawn_background_task(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
conf.basebackup_cache_dir(),
|
||||
conf.basebackup_cache_config.clone(),
|
||||
basebackup_prepare_receiver,
|
||||
Arc::clone(&tenant_manager),
|
||||
shutdown_pageserver.child_token(),
|
||||
@@ -806,7 +806,6 @@ fn start_pageserver(
|
||||
} else {
|
||||
None
|
||||
},
|
||||
basebackup_cache,
|
||||
);
|
||||
|
||||
// Spawn a Pageserver gRPC server task. It will spawn separate tasks for
|
||||
|
||||
@@ -37,7 +37,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
not_modified_since: Lsn(23),
|
||||
},
|
||||
batch_key: 42,
|
||||
message: format!("message {}", msg),
|
||||
message: format!("message {msg}"),
|
||||
}));
|
||||
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
|
||||
eprintln!("pipe seems full");
|
||||
|
||||
@@ -762,4 +762,40 @@ mod tests {
|
||||
let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
|
||||
assert_eq!(result.is_ok(), is_valid);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_posthog_config_is_valid() {
|
||||
let input = r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
|
||||
[posthog_config]
|
||||
server_api_key = "phs_AAA"
|
||||
client_api_key = "phc_BBB"
|
||||
project_id = "000"
|
||||
private_api_url = "https://us.posthog.com"
|
||||
public_api_url = "https://us.i.posthog.com"
|
||||
"#;
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("posthogconfig is valid");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_posthog_incomplete_config_is_valid() {
|
||||
let input = r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
|
||||
[posthog_config]
|
||||
server_api_key = "phs_AAA"
|
||||
private_api_url = "https://us.posthog.com"
|
||||
public_api_url = "https://us.i.posthog.com"
|
||||
"#;
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("posthogconfig is valid");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
use arc_swap::ArcSwap;
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
use posthog_client_lite::{
|
||||
CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
|
||||
CaptureEvent, FeatureResolverBackgroundLoop, PostHogEvaluationError,
|
||||
PostHogFlagFilterPropertyValue,
|
||||
};
|
||||
use remote_storage::RemoteStorageKind;
|
||||
@@ -31,6 +31,13 @@ impl FeatureResolver {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update(&self, spec: String) -> anyhow::Result<()> {
|
||||
if let Some(inner) = &self.inner {
|
||||
inner.update(spec)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn spawn(
|
||||
conf: &PageServerConf,
|
||||
shutdown_pageserver: CancellationToken,
|
||||
@@ -38,16 +45,24 @@ impl FeatureResolver {
|
||||
) -> anyhow::Result<Self> {
|
||||
// DO NOT block in this function: make it return as fast as possible to avoid startup delays.
|
||||
if let Some(posthog_config) = &conf.posthog_config {
|
||||
let inner = FeatureResolverBackgroundLoop::new(
|
||||
PostHogClientConfig {
|
||||
server_api_key: posthog_config.server_api_key.clone(),
|
||||
client_api_key: posthog_config.client_api_key.clone(),
|
||||
project_id: posthog_config.project_id.clone(),
|
||||
private_api_url: posthog_config.private_api_url.clone(),
|
||||
public_api_url: posthog_config.public_api_url.clone(),
|
||||
},
|
||||
shutdown_pageserver,
|
||||
);
|
||||
let posthog_client_config = match posthog_config.clone().try_into_posthog_config() {
|
||||
Ok(config) => config,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"invalid posthog config, skipping posthog integration: {}",
|
||||
e
|
||||
);
|
||||
return Ok(FeatureResolver {
|
||||
inner: None,
|
||||
internal_properties: None,
|
||||
force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(
|
||||
HashMap::new(),
|
||||
))),
|
||||
});
|
||||
}
|
||||
};
|
||||
let inner =
|
||||
FeatureResolverBackgroundLoop::new(posthog_client_config, shutdown_pageserver);
|
||||
let inner = Arc::new(inner);
|
||||
|
||||
// The properties shared by all tenants on this pageserver.
|
||||
|
||||
@@ -1893,9 +1893,13 @@ async fn update_tenant_config_handler(
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.get_generation(),
|
||||
&ShardParameters::default(),
|
||||
ShardParameters::from(tenant.get_shard_identity()),
|
||||
);
|
||||
|
||||
tenant
|
||||
.get_shard_identity()
|
||||
.assert_equal(location_conf.shard); // not strictly necessary since we construct it above
|
||||
|
||||
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
@@ -1937,9 +1941,13 @@ async fn patch_tenant_config_handler(
|
||||
let location_conf = LocationConf::attached_single(
|
||||
updated,
|
||||
tenant.get_generation(),
|
||||
&ShardParameters::default(),
|
||||
ShardParameters::from(tenant.get_shard_identity()),
|
||||
);
|
||||
|
||||
tenant
|
||||
.get_shard_identity()
|
||||
.assert_equal(location_conf.shard); // not strictly necessary since we construct it above
|
||||
|
||||
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
@@ -3743,6 +3751,20 @@ async fn force_override_feature_flag_for_testing_delete(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn update_feature_flag_spec(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let body = json_request(&mut request).await?;
|
||||
let state = get_state(&request);
|
||||
state
|
||||
.feature_resolver
|
||||
.update(body)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -4128,5 +4150,8 @@ pub fn make_router(
|
||||
.delete("/v1/feature_flag/:flag_key", |r| {
|
||||
testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete)
|
||||
})
|
||||
.post("/v1/feature_flag_spec", |r| {
|
||||
api_handler(r, update_feature_flag_spec)
|
||||
})
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -4439,6 +4439,14 @@ pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_basebackup_cache_prepare_queue_size",
|
||||
"Number of requests in the basebackup prepare channel"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_config_ignored_items",
|
||||
|
||||
@@ -12,9 +12,9 @@ use std::task::{Context, Poll};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use std::{io, str};
|
||||
|
||||
use anyhow::{Context as _, anyhow, bail};
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use anyhow::{Context as _, bail};
|
||||
use bytes::{Buf as _, BufMut as _, BytesMut};
|
||||
use chrono::Utc;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, Stream};
|
||||
use itertools::Itertools;
|
||||
@@ -63,7 +63,6 @@ use utils::{failpoint_support, span_record};
|
||||
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup::{self, BasebackupError};
|
||||
use crate::basebackup_cache::BasebackupCache;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
@@ -138,7 +137,6 @@ pub fn spawn(
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
@@ -160,7 +158,6 @@ pub fn spawn(
|
||||
conf.pg_auth_type,
|
||||
tls_config,
|
||||
conf.page_service_pipelining.clone(),
|
||||
basebackup_cache,
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
)
|
||||
@@ -219,7 +216,6 @@ pub async fn libpq_listener_main(
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
) -> Connections {
|
||||
@@ -263,7 +259,6 @@ pub async fn libpq_listener_main(
|
||||
auth_type,
|
||||
tls_config.clone(),
|
||||
pipelining_config.clone(),
|
||||
Arc::clone(&basebackup_cache),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
gate_guard,
|
||||
@@ -306,7 +301,6 @@ async fn page_service_conn_main(
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
@@ -372,7 +366,6 @@ async fn page_service_conn_main(
|
||||
pipelining_config,
|
||||
conf.get_vectored_concurrent_io,
|
||||
perf_span_fields,
|
||||
basebackup_cache,
|
||||
connection_ctx,
|
||||
cancel.clone(),
|
||||
gate_guard,
|
||||
@@ -426,8 +419,6 @@ struct PageServerHandler {
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
gate_guard: GateGuard,
|
||||
}
|
||||
|
||||
@@ -913,7 +904,6 @@ impl PageServerHandler {
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
perf_span_fields: ConnectionPerfSpanFields,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
@@ -927,7 +917,6 @@ impl PageServerHandler {
|
||||
cancel,
|
||||
pipelining_config,
|
||||
get_vectored_concurrent_io,
|
||||
basebackup_cache,
|
||||
gate_guard,
|
||||
}
|
||||
}
|
||||
@@ -2613,26 +2602,16 @@ impl PageServerHandler {
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
None,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
let mut writer = BufWriter::new(pgb.copyout_writer());
|
||||
|
||||
let cached = {
|
||||
// Basebackup is cached only for this combination of parameters.
|
||||
if timeline.is_basebackup_cache_enabled()
|
||||
&& gzip
|
||||
&& lsn.is_some()
|
||||
&& prev_lsn.is_none()
|
||||
{
|
||||
self.basebackup_cache
|
||||
.get(tenant_id, timeline_id, lsn.unwrap())
|
||||
.await
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
let cached = timeline
|
||||
.get_cached_basebackup_if_enabled(lsn, prev_lsn, full_backup, replica, gzip)
|
||||
.await;
|
||||
|
||||
if let Some(mut cached) = cached {
|
||||
from_cache = true;
|
||||
@@ -2641,31 +2620,6 @@ impl PageServerHandler {
|
||||
.map_err(|err| {
|
||||
BasebackupError::Client(err, "handle_basebackup_request,cached,copy")
|
||||
})?;
|
||||
} else if gzip {
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
&mut writer,
|
||||
// NOTE using fast compression because it's on the critical path
|
||||
// for compute startup. For an empty database, we get
|
||||
// <100KB with this method. The Level::Best compression method
|
||||
// gives us <20KB, but maybe we should add basebackup caching
|
||||
// on compute shutdown first.
|
||||
async_compression::Level::Fastest,
|
||||
);
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut encoder,
|
||||
&timeline,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
encoder
|
||||
.shutdown()
|
||||
.await
|
||||
.map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
|
||||
} else {
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut writer,
|
||||
@@ -2674,6 +2628,11 @@ impl PageServerHandler {
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
// NB: using fast compression because it's on the critical path for compute
|
||||
// startup. For an empty database, we get <100KB with this method. The
|
||||
// Level::Best compression method gives us <20KB, but maybe we should add
|
||||
// basebackup caching on compute shutdown first.
|
||||
gzip.then_some(async_compression::Level::Fastest),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -3553,7 +3512,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
if timeline.is_archived() == Some(true) {
|
||||
return Err(tonic::Status::failed_precondition("timeline is archived"));
|
||||
}
|
||||
let req: page_api::GetBaseBackupRequest = req.into_inner().into();
|
||||
let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;
|
||||
|
||||
span_record!(lsn=?req.lsn);
|
||||
|
||||
@@ -3579,20 +3538,50 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
let span = Span::current();
|
||||
let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
|
||||
let jh = tokio::spawn(async move {
|
||||
let result = basebackup::send_basebackup_tarball(
|
||||
&mut simplex_write,
|
||||
&timeline,
|
||||
req.lsn,
|
||||
None,
|
||||
req.full,
|
||||
req.replica,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(span) // propagate request span
|
||||
.await;
|
||||
simplex_write.shutdown().await.map_err(|err| {
|
||||
BasebackupError::Server(anyhow!("simplex shutdown failed: {err}"))
|
||||
})?;
|
||||
let gzip_level = match req.compression {
|
||||
page_api::BaseBackupCompression::None => None,
|
||||
// NB: using fast compression because it's on the critical path for compute
|
||||
// startup. For an empty database, we get <100KB with this method. The
|
||||
// Level::Best compression method gives us <20KB, but maybe we should add
|
||||
// basebackup caching on compute shutdown first.
|
||||
page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest),
|
||||
};
|
||||
|
||||
// Check for a cached basebackup.
|
||||
let cached = timeline
|
||||
.get_cached_basebackup_if_enabled(
|
||||
req.lsn,
|
||||
None,
|
||||
req.full,
|
||||
req.replica,
|
||||
gzip_level.is_some(),
|
||||
)
|
||||
.await;
|
||||
|
||||
let result = if let Some(mut cached) = cached {
|
||||
// If we have a cached basebackup, send it.
|
||||
tokio::io::copy(&mut cached, &mut simplex_write)
|
||||
.await
|
||||
.map(|_| ())
|
||||
.map_err(|err| BasebackupError::Client(err, "cached,copy"))
|
||||
} else {
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut simplex_write,
|
||||
&timeline,
|
||||
req.lsn,
|
||||
None,
|
||||
req.full,
|
||||
req.replica,
|
||||
gzip_level,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(span) // propagate request span
|
||||
.await
|
||||
};
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.map_err(|err| BasebackupError::Client(err, "simplex_write"))?;
|
||||
result
|
||||
});
|
||||
|
||||
@@ -3772,6 +3761,36 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
let resp: page_api::GetSlruSegmentResponse = resp.segment;
|
||||
Ok(tonic::Response::new(resp.into()))
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(lsn))]
|
||||
async fn lease_lsn(
|
||||
&self,
|
||||
req: tonic::Request<proto::LeaseLsnRequest>,
|
||||
) -> Result<tonic::Response<proto::LeaseLsnResponse>, tonic::Status> {
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// Validate and convert the request, and decorate the span.
|
||||
let req: page_api::LeaseLsnRequest = req.into_inner().try_into()?;
|
||||
|
||||
span_record!(lsn=%req.lsn);
|
||||
|
||||
// Attempt to acquire a lease. Return FailedPrecondition if the lease could not be granted.
|
||||
let lease_length = timeline.get_lsn_lease_length();
|
||||
let expires = match timeline.renew_lsn_lease(req.lsn, lease_length, &ctx) {
|
||||
Ok(lease) => lease.valid_until,
|
||||
Err(err) => return Err(tonic::Status::failed_precondition(format!("{err}"))),
|
||||
};
|
||||
|
||||
// TODO: is this spammy? Move it compute-side?
|
||||
info!(
|
||||
"acquired lease for {} until {}",
|
||||
req.lsn,
|
||||
chrono::DateTime::<Utc>::from(expires).to_rfc3339()
|
||||
);
|
||||
|
||||
Ok(tonic::Response::new(expires.into()))
|
||||
}
|
||||
}
|
||||
|
||||
/// gRPC middleware layer that handles observability concerns:
|
||||
|
||||
@@ -3015,7 +3015,7 @@ mod tests {
|
||||
// This shard will get the even blocks
|
||||
let shard = ShardIdentity::from_params(
|
||||
ShardNumber(0),
|
||||
&ShardParameters {
|
||||
ShardParameters {
|
||||
count: ShardCount(2),
|
||||
stripe_size: ShardStripeSize(1),
|
||||
},
|
||||
|
||||
@@ -80,7 +80,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
|
||||
use self::timeline::{
|
||||
EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
|
||||
};
|
||||
use crate::basebackup_cache::BasebackupPrepareSender;
|
||||
use crate::basebackup_cache::BasebackupCache;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context;
|
||||
use crate::context::RequestContextBuilder;
|
||||
@@ -162,7 +162,7 @@ pub struct TenantSharedResources {
|
||||
pub remote_storage: GenericRemoteStorage,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
pub l0_flush_global_state: L0FlushGlobalState,
|
||||
pub basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
pub basebackup_cache: Arc<BasebackupCache>,
|
||||
pub feature_resolver: FeatureResolver,
|
||||
}
|
||||
|
||||
@@ -331,7 +331,7 @@ pub struct TenantShard {
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
/// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
@@ -1363,7 +1363,7 @@ impl TenantShard {
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
basebackup_cache,
|
||||
feature_resolver,
|
||||
} = resources;
|
||||
|
||||
@@ -1380,7 +1380,7 @@ impl TenantShard {
|
||||
remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
basebackup_cache,
|
||||
feature_resolver,
|
||||
));
|
||||
|
||||
@@ -3872,6 +3872,10 @@ impl TenantShard {
|
||||
&self.tenant_shard_id
|
||||
}
|
||||
|
||||
pub(crate) fn get_shard_identity(&self) -> ShardIdentity {
|
||||
self.shard_identity
|
||||
}
|
||||
|
||||
pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
|
||||
self.shard_identity.stripe_size
|
||||
}
|
||||
@@ -4380,7 +4384,7 @@ impl TenantShard {
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
feature_resolver: FeatureResolver,
|
||||
) -> TenantShard {
|
||||
assert!(!attached_conf.location.generation.is_none());
|
||||
@@ -4485,7 +4489,7 @@ impl TenantShard {
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
gc_block: Default::default(),
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
basebackup_cache,
|
||||
feature_resolver,
|
||||
}
|
||||
}
|
||||
@@ -4525,6 +4529,10 @@ impl TenantShard {
|
||||
Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
|
||||
}
|
||||
|
||||
/// Stores a tenant location config to disk.
|
||||
///
|
||||
/// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid
|
||||
/// changes to shard parameters that may result in data corruption.
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(super) async fn persist_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -5414,7 +5422,7 @@ impl TenantShard {
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
|
||||
basebackup_cache: self.basebackup_cache.clone(),
|
||||
feature_resolver: self.feature_resolver.clone(),
|
||||
}
|
||||
}
|
||||
@@ -6000,7 +6008,7 @@ pub(crate) mod harness {
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
|
||||
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
|
||||
|
||||
let tenant = Arc::new(TenantShard::new(
|
||||
TenantState::Attaching,
|
||||
@@ -6008,7 +6016,7 @@ pub(crate) mod harness {
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
self.tenant_conf.clone(),
|
||||
self.generation,
|
||||
&ShardParameters::default(),
|
||||
ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
self.shard_identity,
|
||||
@@ -6018,7 +6026,7 @@ pub(crate) mod harness {
|
||||
self.deletion_queue.new_client(),
|
||||
// TODO: ideally we should run all unit tests with both configs
|
||||
L0FlushGlobalState::new(L0FlushConfig::default()),
|
||||
basebackup_requst_sender,
|
||||
basebackup_cache,
|
||||
FeatureResolver::new_disabled(),
|
||||
));
|
||||
|
||||
@@ -11429,11 +11437,11 @@ mod tests {
|
||||
if left != right {
|
||||
eprintln!("---LEFT---");
|
||||
for left in left.iter() {
|
||||
eprintln!("{}", left);
|
||||
eprintln!("{left}");
|
||||
}
|
||||
eprintln!("---RIGHT---");
|
||||
for right in right.iter() {
|
||||
eprintln!("{}", right);
|
||||
eprintln!("{right}");
|
||||
}
|
||||
assert_eq!(left, right);
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::critical;
|
||||
use utils::generation::Generation;
|
||||
|
||||
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
@@ -136,7 +137,7 @@ impl LocationConf {
|
||||
pub(crate) fn attached_single(
|
||||
tenant_conf: pageserver_api::models::TenantConfig,
|
||||
generation: Generation,
|
||||
shard_params: &models::ShardParameters,
|
||||
shard_params: models::ShardParameters,
|
||||
) -> Self {
|
||||
Self {
|
||||
mode: LocationMode::Attached(AttachedLocationConfig {
|
||||
@@ -171,6 +172,16 @@ impl LocationConf {
|
||||
}
|
||||
}
|
||||
|
||||
// This should never happen.
|
||||
// TODO: turn this into a proper assertion.
|
||||
if stripe_size != self.shard.stripe_size {
|
||||
critical!(
|
||||
"stripe size mismatch: {} != {}",
|
||||
self.shard.stripe_size,
|
||||
stripe_size,
|
||||
);
|
||||
}
|
||||
|
||||
self.shard.stripe_size = stripe_size;
|
||||
}
|
||||
|
||||
|
||||
@@ -880,6 +880,9 @@ impl TenantManager {
|
||||
// phase of writing config and/or waiting for flush, before returning.
|
||||
match fast_path_taken {
|
||||
Some(FastPathModified::Attached(tenant)) => {
|
||||
tenant
|
||||
.shard_identity
|
||||
.assert_equal(new_location_config.shard);
|
||||
TenantShard::persist_tenant_config(
|
||||
self.conf,
|
||||
&tenant_shard_id,
|
||||
@@ -914,7 +917,10 @@ impl TenantManager {
|
||||
|
||||
return Ok(Some(tenant));
|
||||
}
|
||||
Some(FastPathModified::Secondary(_secondary_tenant)) => {
|
||||
Some(FastPathModified::Secondary(secondary_tenant)) => {
|
||||
secondary_tenant
|
||||
.shard_identity
|
||||
.assert_equal(new_location_config.shard);
|
||||
TenantShard::persist_tenant_config(
|
||||
self.conf,
|
||||
&tenant_shard_id,
|
||||
@@ -948,6 +954,10 @@ impl TenantManager {
|
||||
|
||||
match slot_guard.get_old_value() {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
tenant
|
||||
.shard_identity
|
||||
.assert_equal(new_location_config.shard);
|
||||
|
||||
// The case where we keep a Tenant alive was covered above in the special case
|
||||
// for Attached->Attached transitions in the same generation. By this point,
|
||||
// if we see an attached tenant we know it will be discarded and should be
|
||||
@@ -981,9 +991,13 @@ impl TenantManager {
|
||||
// rather than assuming it to be empty.
|
||||
spawn_mode = SpawnMode::Eager;
|
||||
}
|
||||
Some(TenantSlot::Secondary(state)) => {
|
||||
Some(TenantSlot::Secondary(secondary_tenant)) => {
|
||||
secondary_tenant
|
||||
.shard_identity
|
||||
.assert_equal(new_location_config.shard);
|
||||
|
||||
info!("Shutting down secondary tenant");
|
||||
state.shutdown().await;
|
||||
secondary_tenant.shutdown().await;
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => {
|
||||
// This should never happen: acquire_slot should error out
|
||||
@@ -2200,7 +2214,7 @@ impl TenantManager {
|
||||
selector: ShardSelector,
|
||||
) -> ShardResolveResult {
|
||||
let tenants = self.tenants.read().unwrap();
|
||||
let mut want_shard = None;
|
||||
let mut want_shard: Option<ShardIndex> = None;
|
||||
let mut any_in_progress = None;
|
||||
|
||||
match &*tenants {
|
||||
@@ -2225,14 +2239,23 @@ impl TenantManager {
|
||||
return ShardResolveResult::Found(tenant.clone());
|
||||
}
|
||||
ShardSelector::Page(key) => {
|
||||
// First slot we see for this tenant, calculate the expected shard number
|
||||
// for the key: we will use this for checking if this and subsequent
|
||||
// slots contain the key, rather than recalculating the hash each time.
|
||||
if want_shard.is_none() {
|
||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||
// Each time we find an attached slot with a different shard count,
|
||||
// recompute the expected shard number: during shard splits we might
|
||||
// have multiple shards with the old shard count.
|
||||
if want_shard.is_none()
|
||||
|| want_shard.unwrap().shard_count != tenant.shard_identity.count
|
||||
{
|
||||
want_shard = Some(ShardIndex {
|
||||
shard_number: tenant.shard_identity.get_shard_number(&key),
|
||||
shard_count: tenant.shard_identity.count,
|
||||
});
|
||||
}
|
||||
|
||||
if Some(tenant.shard_identity.number) == want_shard {
|
||||
if Some(ShardIndex {
|
||||
shard_number: tenant.shard_identity.number,
|
||||
shard_count: tenant.shard_identity.count,
|
||||
}) == want_shard
|
||||
{
|
||||
return ShardResolveResult::Found(tenant.clone());
|
||||
}
|
||||
}
|
||||
@@ -2891,14 +2914,18 @@ mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tracing::Instrument;
|
||||
|
||||
use super::super::harness::TenantHarness;
|
||||
use super::TenantsMap;
|
||||
use crate::tenant::{
|
||||
TenantSharedResources,
|
||||
mgr::{BackgroundPurges, TenantManager, TenantSlot},
|
||||
use crate::{
|
||||
basebackup_cache::BasebackupCache,
|
||||
tenant::{
|
||||
TenantSharedResources,
|
||||
mgr::{BackgroundPurges, TenantManager, TenantSlot},
|
||||
},
|
||||
};
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
@@ -2924,9 +2951,7 @@ mod tests {
|
||||
// Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually
|
||||
// permit it to proceed: that will stick the tenant in InProgress
|
||||
|
||||
let (basebackup_prepare_sender, _) = tokio::sync::mpsc::unbounded_channel::<
|
||||
crate::basebackup_cache::BasebackupPrepareRequest,
|
||||
>();
|
||||
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
|
||||
|
||||
let tenant_manager = TenantManager {
|
||||
tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)),
|
||||
@@ -2940,7 +2965,7 @@ mod tests {
|
||||
l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new(
|
||||
h.conf.l0_flush.clone(),
|
||||
),
|
||||
basebackup_prepare_sender,
|
||||
basebackup_cache,
|
||||
feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(),
|
||||
},
|
||||
cancel: tokio_util::sync::CancellationToken::new(),
|
||||
|
||||
@@ -101,7 +101,7 @@ pub(crate) struct SecondaryTenant {
|
||||
// Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. However,
|
||||
// storing these enables us to report our full LocationConf, enabling convenient reconciliation
|
||||
// by the control plane (see [`Self::get_location_conf`])
|
||||
shard_identity: ShardIdentity,
|
||||
pub(crate) shard_identity: ShardIdentity,
|
||||
tenant_conf: std::sync::Mutex<pageserver_api::models::TenantConfig>,
|
||||
|
||||
// Internal state used by the Downloader.
|
||||
|
||||
@@ -55,11 +55,11 @@ pub struct BatchLayerWriter {
|
||||
}
|
||||
|
||||
impl BatchLayerWriter {
|
||||
pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
pub fn new(conf: &'static PageServerConf) -> Self {
|
||||
Self {
|
||||
generated_layer_writers: Vec::new(),
|
||||
conf,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_unfinished_image_writer(
|
||||
@@ -209,6 +209,7 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
// XXX make this lazy like in SplitDeltaLayerWriter?
|
||||
inner: ImageLayerWriter::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
@@ -223,7 +224,7 @@ impl<'a> SplitImageLayerWriter<'a> {
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
batches: BatchLayerWriter::new(conf).await?,
|
||||
batches: BatchLayerWriter::new(conf),
|
||||
lsn,
|
||||
start_key,
|
||||
gate,
|
||||
@@ -319,7 +320,7 @@ pub struct SplitDeltaLayerWriter<'a> {
|
||||
}
|
||||
|
||||
impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
pub async fn new(
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -327,8 +328,8 @@ impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
target_layer_size: u64,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
) -> Self {
|
||||
Self {
|
||||
target_layer_size,
|
||||
inner: None,
|
||||
conf,
|
||||
@@ -336,10 +337,10 @@ impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
tenant_shard_id,
|
||||
lsn_range,
|
||||
last_key_written: Key::MIN,
|
||||
batches: BatchLayerWriter::new(conf).await?,
|
||||
batches: BatchLayerWriter::new(conf),
|
||||
gate,
|
||||
cancel,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
@@ -510,9 +511,7 @@ mod tests {
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
);
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &ctx)
|
||||
@@ -590,9 +589,7 @@ mod tests {
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
);
|
||||
const N: usize = 2000;
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
@@ -692,9 +689,7 @@ mod tests {
|
||||
4 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
);
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &ctx)
|
||||
@@ -770,9 +765,7 @@ mod tests {
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
);
|
||||
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
|
||||
@@ -17,14 +17,17 @@ use tracing::*;
|
||||
use utils::backoff::exponential_backoff_duration;
|
||||
use utils::completion::Barrier;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::sync::gate::GateError;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
|
||||
use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
|
||||
use crate::tenant::blob_io::WriteBlobError;
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::compaction::CompactionOutcome;
|
||||
use crate::tenant::{TenantShard, TenantState};
|
||||
use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
|
||||
|
||||
/// Semaphore limiting concurrent background tasks (across all tenants).
|
||||
///
|
||||
@@ -313,7 +316,20 @@ pub(crate) fn log_compaction_error(
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let is_stopping = upload_queue || timeline;
|
||||
let buffered_writer_flush_task_canelled = root_cause
|
||||
.downcast_ref::<FlushTaskError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let write_blob_cancelled = root_cause
|
||||
.downcast_ref::<WriteBlobError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let gate_closed = root_cause
|
||||
.downcast_ref::<GateError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let is_stopping = upload_queue
|
||||
|| timeline
|
||||
|| buffered_writer_flush_task_canelled
|
||||
|| write_blob_cancelled
|
||||
|| gate_closed;
|
||||
|
||||
if is_stopping {
|
||||
Level::INFO
|
||||
|
||||
@@ -95,12 +95,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
|
||||
use super::tasks::log_compaction_error;
|
||||
use super::upload_queue::NotInitialized;
|
||||
use super::{
|
||||
AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
|
||||
AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
};
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use crate::aux_file::AuxFileSizeEstimator;
|
||||
use crate::basebackup_cache::BasebackupPrepareRequest;
|
||||
use crate::basebackup_cache::BasebackupCache;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
@@ -201,7 +201,7 @@ pub struct TimelineResources {
|
||||
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub l0_compaction_trigger: Arc<Notify>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
pub basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
pub basebackup_cache: Arc<BasebackupCache>,
|
||||
pub feature_resolver: FeatureResolver,
|
||||
}
|
||||
|
||||
@@ -448,7 +448,7 @@ pub struct Timeline {
|
||||
wait_lsn_log_slow: tokio::sync::Semaphore,
|
||||
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
feature_resolver: FeatureResolver,
|
||||
}
|
||||
@@ -763,7 +763,7 @@ pub(crate) enum CreateImageLayersError {
|
||||
PageReconstructError(#[source] PageReconstructError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<layer_manager::Shutdown> for CreateImageLayersError {
|
||||
@@ -2500,6 +2500,37 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
|
||||
}
|
||||
|
||||
/// Try to get a basebackup from the on-disk cache.
|
||||
pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option<tokio::fs::File> {
|
||||
self.basebackup_cache
|
||||
.get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Convenience method to attempt fetching a basebackup for the timeline if enabled and safe for
|
||||
/// the given request parameters.
|
||||
///
|
||||
/// TODO: consider moving this onto GrpcPageServiceHandler once the libpq handler is gone.
|
||||
pub async fn get_cached_basebackup_if_enabled(
|
||||
&self,
|
||||
lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full: bool,
|
||||
replica: bool,
|
||||
gzip: bool,
|
||||
) -> Option<tokio::fs::File> {
|
||||
if !self.is_basebackup_cache_enabled() || !self.basebackup_cache.is_enabled() {
|
||||
return None;
|
||||
}
|
||||
// We have to know which LSN to fetch the basebackup for.
|
||||
let lsn = lsn?;
|
||||
// We only cache gzipped, non-full basebackups for primary computes with automatic prev_lsn.
|
||||
if prev_lsn.is_some() || full || replica || !gzip {
|
||||
return None;
|
||||
}
|
||||
self.get_cached_basebackup(lsn).await
|
||||
}
|
||||
|
||||
/// Prepare basebackup for the given LSN and store it in the basebackup cache.
|
||||
/// The method is asynchronous and returns immediately.
|
||||
/// The actual basebackup preparation is performed in the background
|
||||
@@ -2521,17 +2552,8 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
|
||||
let res = self
|
||||
.basebackup_prepare_sender
|
||||
.send(BasebackupPrepareRequest {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
timeline_id: self.timeline_id,
|
||||
lsn,
|
||||
});
|
||||
if let Err(e) = res {
|
||||
// May happen during shutdown, it's not critical.
|
||||
info!("Failed to send shutdown checkpoint: {e:#}");
|
||||
}
|
||||
self.basebackup_cache
|
||||
.send_prepare(self.tenant_shard_id, self.timeline_id, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3088,7 +3110,7 @@ impl Timeline {
|
||||
|
||||
wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
|
||||
|
||||
basebackup_prepare_sender: resources.basebackup_prepare_sender,
|
||||
basebackup_cache: resources.basebackup_cache,
|
||||
|
||||
feature_resolver: resources.feature_resolver,
|
||||
};
|
||||
@@ -4658,6 +4680,16 @@ impl Timeline {
|
||||
mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
// Always notify waiters about the flush loop exiting since the loop might stop
|
||||
// when the timeline hasn't been cancelled.
|
||||
let scopeguard_rx = layer_flush_start_rx.clone();
|
||||
scopeguard::defer! {
|
||||
let (flush_counter, _) = *scopeguard_rx.borrow();
|
||||
let _ = self
|
||||
.layer_flush_done_tx
|
||||
.send_replace((flush_counter, Err(FlushLayerError::Cancelled)));
|
||||
}
|
||||
|
||||
// Subscribe to L0 delta layer updates, for compaction backpressure.
|
||||
let mut watch_l0 = match self
|
||||
.layers
|
||||
@@ -4687,9 +4719,6 @@ impl Timeline {
|
||||
let result = loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
info!("dropping out of flush loop for timeline shutdown");
|
||||
// Note: we do not bother transmitting into [`layer_flush_done_tx`], because
|
||||
// anyone waiting on that will respect self.cancel as well: they will stop
|
||||
// waiting at the same time we as drop out of this loop.
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5561,7 +5590,7 @@ impl Timeline {
|
||||
self.should_check_if_image_layers_required(lsn)
|
||||
};
|
||||
|
||||
let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;
|
||||
let mut batch_image_writer = BatchLayerWriter::new(self.conf);
|
||||
|
||||
let mut all_generated = true;
|
||||
|
||||
@@ -5665,7 +5694,8 @@ impl Timeline {
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
@@ -5760,7 +5790,10 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let image_layers = batch_image_writer.finish(self, ctx).await?;
|
||||
let image_layers = batch_image_writer
|
||||
.finish(self, ctx)
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;
|
||||
|
||||
|
||||
@@ -3531,10 +3531,7 @@ impl Timeline {
|
||||
self.get_compaction_target_size(),
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
);
|
||||
|
||||
#[derive(Default)]
|
||||
struct RewritingLayers {
|
||||
@@ -4330,7 +4327,8 @@ impl TimelineAdaptor {
|
||||
self.timeline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
@@ -4339,7 +4337,10 @@ impl TimelineAdaptor {
|
||||
});
|
||||
|
||||
let keyspace = KeySpace {
|
||||
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
|
||||
ranges: self
|
||||
.get_keyspace(key_range, lsn, ctx)
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?,
|
||||
};
|
||||
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
|
||||
let outcome = self
|
||||
@@ -4358,9 +4359,13 @@ impl TimelineAdaptor {
|
||||
unfinished_image_layer,
|
||||
} = outcome
|
||||
{
|
||||
let (desc, path) = unfinished_image_layer.finish(ctx).await?;
|
||||
let (desc, path) = unfinished_image_layer
|
||||
.finish(ctx)
|
||||
.await
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
let image_layer =
|
||||
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
|
||||
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)
|
||||
.map_err(CreateImageLayersError::Other)?;
|
||||
self.new_images.push(image_layer);
|
||||
}
|
||||
|
||||
|
||||
@@ -241,8 +241,17 @@ impl DeleteTimelineFlow {
|
||||
{
|
||||
Ok(r) => r,
|
||||
Err(DownloadError::NotFound) => {
|
||||
// Deletion is already complete
|
||||
// Deletion is already complete.
|
||||
// As we came here, we will need to remove the timeline from the tenant though.
|
||||
tracing::info!("Timeline already deleted in remote storage");
|
||||
if let TimelineOrOffloaded::Offloaded(_) = &timeline {
|
||||
// We only supoprt this for offloaded timelines, as we don't know which state non-offloaded timelines are in.
|
||||
tracing::info!(
|
||||
"Timeline with gone index part is offloaded timeline. Removing from tenant."
|
||||
);
|
||||
remove_maybe_offloaded_timeline_from_tenant(tenant, &timeline, &guard)
|
||||
.await?;
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -885,7 +885,7 @@ async fn remote_copy(
|
||||
}
|
||||
tracing::info!("Deleting orphan layer file to make way for hard linking");
|
||||
// Delete orphan layer file and try again, to ensure this layer has a well understood source
|
||||
std::fs::remove_file(adopted_path)
|
||||
std::fs::remove_file(&adoptee_path)
|
||||
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
|
||||
std::fs::hard_link(adopted_path, &adoptee_path)
|
||||
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
|
||||
|
||||
@@ -887,7 +887,7 @@ mod tests {
|
||||
.expect("we still have it");
|
||||
}
|
||||
|
||||
fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
|
||||
fn make_relation_key_for_shard(shard: ShardNumber, params: ShardParameters) -> Key {
|
||||
rel_block_to_key(
|
||||
RelTag {
|
||||
spcnode: 1663,
|
||||
@@ -917,14 +917,14 @@ mod tests {
|
||||
let child0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
|
||||
shard: ShardIdentity::from_params(ShardNumber(0), child_params),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let child1 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
|
||||
shard: ShardIdentity::from_params(ShardNumber(1), child_params),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
@@ -937,7 +937,7 @@ mod tests {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
|
||||
&StubManager {
|
||||
shards: vec![parent.clone()],
|
||||
},
|
||||
@@ -961,7 +961,7 @@ mod tests {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
|
||||
&StubManager {
|
||||
shards: vec![], // doesn't matter what's in here, the cache is fully loaded
|
||||
},
|
||||
@@ -978,7 +978,7 @@ mod tests {
|
||||
let parent_handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), child_params)),
|
||||
&StubManager {
|
||||
shards: vec![parent.clone()],
|
||||
},
|
||||
@@ -995,7 +995,7 @@ mod tests {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
|
||||
&StubManager {
|
||||
shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
|
||||
},
|
||||
|
||||
@@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let copy_stream = replication_client.copy_both_simple(&query).await?;
|
||||
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx);
|
||||
let walingest_res = select! {
|
||||
walingest_res = walingest_future => walingest_res,
|
||||
_ = cancellation.cancelled() => {
|
||||
// We are doing reads in WalIngest::new, and those can hang as they come from the network.
|
||||
// Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
|
||||
debug!("Connection cancelled");
|
||||
return Err(WalReceiverError::Cancelled);
|
||||
},
|
||||
};
|
||||
let mut walingest = walingest_res.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
|
||||
let (format, compression) = match protocol {
|
||||
PostgresClientProtocol::Interpreted {
|
||||
|
||||
@@ -1295,7 +1295,8 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
if (iteration_hits != 0)
|
||||
{
|
||||
/* chunk offset (# of pages) into the LFC file */
|
||||
/* chunk offset (#
|
||||
of pages) into the LFC file */
|
||||
off_t first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk;
|
||||
int nwrite = iov_last_used - first_block_in_chunk_read;
|
||||
/* offset of first IOV */
|
||||
@@ -1313,16 +1314,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
lfc_disable("read");
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* We successfully read the pages we know were valid when we
|
||||
* started reading; now mark those pages as read
|
||||
*/
|
||||
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
|
||||
{
|
||||
if (BITMAP_ISSET(chunk_mask, i))
|
||||
BITMAP_SET(mask, buf_offset + i);
|
||||
}
|
||||
}
|
||||
|
||||
/* Place entry to the head of LRU list */
|
||||
@@ -1340,6 +1331,15 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
lfc_ctl->time_read += io_time_us;
|
||||
inc_page_cache_read_wait(io_time_us);
|
||||
/*
|
||||
* We successfully read the pages we know were valid when we
|
||||
* started reading; now mark those pages as read
|
||||
*/
|
||||
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
|
||||
{
|
||||
if (BITMAP_ISSET(chunk_mask, i))
|
||||
BITMAP_SET(mask, buf_offset + i);
|
||||
}
|
||||
}
|
||||
|
||||
CriticalAssert(entry->access_count > 0);
|
||||
|
||||
121
postgres.mk
Normal file
121
postgres.mk
Normal file
@@ -0,0 +1,121 @@
|
||||
# Sub-makefile for compiling PostgreSQL as part of Neon. This is
|
||||
# included from the main Makefile, and is not meant to be called
|
||||
# directly.
|
||||
#
|
||||
# CI workflows and Dockerfiles can take advantage of the following
|
||||
# properties for caching:
|
||||
#
|
||||
# - Compiling the targets in this file only builds the PostgreSQL sources
|
||||
# under the vendor/ subdirectory, nothing else from the repository.
|
||||
# - All outputs go to POSTGRES_INSTALL_DIR (by default 'pg_install',
|
||||
# see parent Makefile)
|
||||
# - intermediate build artifacts go to BUILD_DIR
|
||||
#
|
||||
#
|
||||
# Variables passed from the parent Makefile that control what gets
|
||||
# installed and where:
|
||||
# - POSTGRES_VERSIONS
|
||||
# - POSTGRES_INSTALL_DIR
|
||||
# - BUILD_DIR
|
||||
#
|
||||
# Variables passed from the parent Makefile that affect the build
|
||||
# process and the resulting binaries:
|
||||
# - PG_CONFIGURE_OPTS
|
||||
# - PG_CFLAGS
|
||||
# - PG_LDFLAGS
|
||||
# - EXTRA_PATH_OVERRIDES
|
||||
|
||||
###
|
||||
### Main targets
|
||||
###
|
||||
### These are called from the main Makefile, and can also be called
|
||||
### directly from command line
|
||||
|
||||
# Compile and install a specific PostgreSQL version
|
||||
postgres-install-%: postgres-configure-% \
|
||||
postgres-headers-install-% # to prevent `make install` conflicts with neon's `postgres-headers`
|
||||
|
||||
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
|
||||
#
|
||||
# This is implicitly part of the 'postgres-install-%' target, but this can be handy
|
||||
# if you want to install just the headers without building PostgreSQL, e.g. for building
|
||||
# extensions.
|
||||
postgres-headers-install-%: postgres-configure-%
|
||||
+@echo "Installing PostgreSQL $* headers"
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install
|
||||
|
||||
# Run Postgres regression tests
|
||||
postgres-check-%: postgres-install-%
|
||||
$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check
|
||||
|
||||
###
|
||||
### Shorthands for the main targets, for convenience
|
||||
###
|
||||
|
||||
# Same as the above main targets, but for all supported PostgreSQL versions
|
||||
# For example, 'make postgres-install' is equivalent to
|
||||
# 'make postgres-install-v14 postgres-install-v15 postgres-install-v16 postgres-install-v17'
|
||||
all_version_targets=postgres-install postgres-headers-install postgres-check
|
||||
.PHONY: $(all_version_targets)
|
||||
$(all_version_targets): postgres-%: $(foreach pg_version,$(POSTGRES_VERSIONS),postgres-%-$(pg_version))
|
||||
|
||||
.PHONY: postgres
|
||||
postgres: postgres-install
|
||||
|
||||
.PHONY: postgres-headers
|
||||
postgres-headers: postgres-headers-install
|
||||
|
||||
# 'postgres-v17' is an alias for 'postgres-install-v17' etc.
|
||||
$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-$(pg_version)): postgres-%: postgres-install-%
|
||||
|
||||
###
|
||||
### Intermediate targets
|
||||
###
|
||||
### These are not intended to be called directly, but are dependencies for the
|
||||
### main targets.
|
||||
|
||||
# Run 'configure'
|
||||
$(BUILD_DIR)/%/config.status:
|
||||
mkdir -p $(BUILD_DIR)
|
||||
test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG
|
||||
|
||||
+@echo "Configuring Postgres $* build"
|
||||
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
|
||||
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
|
||||
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
|
||||
exit 1; }
|
||||
mkdir -p $(BUILD_DIR)/$*
|
||||
|
||||
VERSION=$*; \
|
||||
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
|
||||
(cd $(BUILD_DIR)/$$VERSION && \
|
||||
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
|
||||
CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
|
||||
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
|
||||
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
|
||||
|
||||
# nicer alias to run 'configure'.
|
||||
#
|
||||
# This tries to accomplish this rule:
|
||||
#
|
||||
# postgres-configure-%: $(BUILD_DIR)/%/config.status
|
||||
#
|
||||
# XXX: I'm not sure why the above rule doesn't work directly. But this accomplishses
|
||||
# the same thing
|
||||
$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-configure-$(pg_version)): postgres-configure-%: FORCE $(BUILD_DIR)/%/config.status
|
||||
|
||||
# Compile and install PostgreSQL (and a few contrib modules used in tests)
|
||||
postgres-install-%: postgres-configure-% \
|
||||
postgres-headers-install-% # to prevent `make install` conflicts with neon's `postgres-headers-install`
|
||||
+@echo "Compiling PostgreSQL $*"
|
||||
$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install
|
||||
$(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install
|
||||
|
||||
.PHONY: FORCE
|
||||
FORCE:
|
||||
@@ -6,7 +6,6 @@ use std::collections::BTreeMap;
|
||||
use std::pin::pin;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use futures::future::Either;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::sync::oneshot::error::TryRecvError;
|
||||
|
||||
@@ -49,37 +48,67 @@ impl<P: QueueProcessing> BatchQueue<P> {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn call(&self, req: P::Req) -> P::Res {
|
||||
/// Perform a single request-response process, this may be batched internally.
|
||||
///
|
||||
/// This function is not cancel safe.
|
||||
pub async fn call<R>(
|
||||
&self,
|
||||
req: P::Req,
|
||||
cancelled: impl Future<Output = R>,
|
||||
) -> Result<P::Res, R> {
|
||||
let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);
|
||||
let guard = scopeguard::guard(id, move |id| {
|
||||
let mut inner = self.inner.lock_propagate_poison();
|
||||
if inner.queue.remove(&id).is_some() {
|
||||
tracing::debug!("batched task cancelled before completion");
|
||||
}
|
||||
});
|
||||
|
||||
let mut cancelled = pin!(cancelled);
|
||||
let resp = loop {
|
||||
// try become the leader, or try wait for success.
|
||||
let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await
|
||||
{
|
||||
// we got the resp.
|
||||
Either::Left((resp, _)) => break resp.ok(),
|
||||
// we are the leader.
|
||||
Either::Right((p, rx_)) => {
|
||||
rx = rx_;
|
||||
p
|
||||
}
|
||||
let mut processor = tokio::select! {
|
||||
// try become leader.
|
||||
p = self.processor.lock() => p,
|
||||
// wait for success.
|
||||
resp = &mut rx => break resp.ok(),
|
||||
// wait for cancellation.
|
||||
cancel = cancelled.as_mut() => {
|
||||
let mut inner = self.inner.lock_propagate_poison();
|
||||
if inner.queue.remove(&id).is_some() {
|
||||
tracing::warn!("batched task cancelled before completion");
|
||||
}
|
||||
return Err(cancel);
|
||||
},
|
||||
};
|
||||
|
||||
tracing::debug!(id, "batch: became leader");
|
||||
let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor);
|
||||
|
||||
// snitch incase the task gets cancelled.
|
||||
let cancel_safety = scopeguard::guard((), |()| {
|
||||
if !std::thread::panicking() {
|
||||
tracing::error!(
|
||||
id,
|
||||
"batch: leader cancelled, despite not being cancellation safe"
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
// apply a batch.
|
||||
// if this is cancelled, jobs will not be completed and will panic.
|
||||
let values = processor.apply(reqs).await;
|
||||
|
||||
// good: we didn't get cancelled.
|
||||
ScopeGuard::into_inner(cancel_safety);
|
||||
|
||||
if values.len() != resps.len() {
|
||||
tracing::error!(
|
||||
"batch: invalid response size, expected={}, got={}",
|
||||
resps.len(),
|
||||
values.len()
|
||||
);
|
||||
}
|
||||
|
||||
// send response values.
|
||||
for (tx, value) in std::iter::zip(resps, values) {
|
||||
// sender hung up but that's fine.
|
||||
drop(tx.send(value));
|
||||
if tx.send(value).is_err() {
|
||||
// receiver hung up but that's fine.
|
||||
}
|
||||
}
|
||||
|
||||
match rx.try_recv() {
|
||||
@@ -98,10 +127,9 @@ impl<P: QueueProcessing> BatchQueue<P> {
|
||||
}
|
||||
};
|
||||
|
||||
// already removed.
|
||||
ScopeGuard::into_inner(guard);
|
||||
tracing::debug!(id, "batch: job completed");
|
||||
|
||||
resp.expect("no response found. batch processer should not panic")
|
||||
Ok(resp.expect("no response found. batch processer should not panic"))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,6 +153,8 @@ impl<P: QueueProcessing> BatchQueueInner<P> {
|
||||
|
||||
self.queue.insert(id, BatchJob { req, res: tx });
|
||||
|
||||
tracing::debug!(id, "batch: registered job in the queue");
|
||||
|
||||
(id, rx)
|
||||
}
|
||||
|
||||
@@ -132,15 +162,19 @@ impl<P: QueueProcessing> BatchQueueInner<P> {
|
||||
let batch_size = p.batch_size(self.queue.len());
|
||||
let mut reqs = Vec::with_capacity(batch_size);
|
||||
let mut resps = Vec::with_capacity(batch_size);
|
||||
let mut ids = Vec::with_capacity(batch_size);
|
||||
|
||||
while reqs.len() < batch_size {
|
||||
let Some((_, job)) = self.queue.pop_first() else {
|
||||
let Some((id, job)) = self.queue.pop_first() else {
|
||||
break;
|
||||
};
|
||||
reqs.push(job.req);
|
||||
resps.push(job.res);
|
||||
ids.push(id);
|
||||
}
|
||||
|
||||
tracing::debug!(ids=?ids, "batch: acquired jobs");
|
||||
|
||||
(reqs, resps)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -279,7 +279,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
|
||||
},
|
||||
proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
|
||||
handshake_timeout: Duration::from_secs(10),
|
||||
region: "local".into(),
|
||||
wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
|
||||
connect_compute_locks,
|
||||
connect_to_compute: compute_config,
|
||||
|
||||
@@ -26,9 +26,10 @@ use utils::sentry_init::init_sentry;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::{Metrics, ThreadPoolMetrics};
|
||||
use crate::pglb::TlsRequired;
|
||||
use crate::pqproto::FeStartupPacket;
|
||||
use crate::protocol2::ConnectionInfo;
|
||||
use crate::proxy::{ErrorSource, TlsRequired, copy_bidirectional_client_compute};
|
||||
use crate::proxy::{ErrorSource, copy_bidirectional_client_compute};
|
||||
use crate::stream::{PqStream, Stream};
|
||||
use crate::util::run_until_cancelled;
|
||||
|
||||
@@ -236,7 +237,6 @@ pub(super) async fn task_main(
|
||||
extra: None,
|
||||
},
|
||||
crate::metrics::Protocol::SniRouter,
|
||||
"sni",
|
||||
);
|
||||
handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await
|
||||
}
|
||||
|
||||
@@ -123,12 +123,6 @@ struct ProxyCliArgs {
|
||||
/// timeout for the TLS handshake
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
handshake_timeout: tokio::time::Duration,
|
||||
/// http endpoint to receive periodic metric updates
|
||||
#[clap(long)]
|
||||
metric_collection_endpoint: Option<String>,
|
||||
/// how often metrics should be sent to a collection endpoint
|
||||
#[clap(long)]
|
||||
metric_collection_interval: Option<String>,
|
||||
/// cache for `wake_compute` api method (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
wake_compute_cache: String,
|
||||
@@ -155,40 +149,31 @@ struct ProxyCliArgs {
|
||||
/// Wake compute rate limiter max number of requests per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
|
||||
wake_compute_limit: Vec<RateBucketInfo>,
|
||||
/// Redis rate limiter max number of requests per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
|
||||
redis_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Cancellation channel size (max queue size for redis kv client)
|
||||
#[clap(long, default_value_t = 1024)]
|
||||
cancellation_ch_size: usize,
|
||||
/// Cancellation ops batch size for redis
|
||||
#[clap(long, default_value_t = 8)]
|
||||
cancellation_batch_size: usize,
|
||||
/// cache for `allowed_ips` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
allowed_ips_cache: String,
|
||||
/// cache for `role_secret` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
role_secret_cache: String,
|
||||
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
|
||||
#[clap(long)]
|
||||
redis_notifications: Option<String>,
|
||||
/// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain".
|
||||
/// redis url for plain authentication
|
||||
#[clap(long, alias("redis-notifications"))]
|
||||
redis_plain: Option<String>,
|
||||
/// what from the available authentications type to use for redis. Supported are "irsa" and "plain".
|
||||
#[clap(long, default_value = "irsa")]
|
||||
redis_auth_type: String,
|
||||
/// redis host for streaming connections (might be different from the notifications host)
|
||||
/// redis host for irsa authentication
|
||||
#[clap(long)]
|
||||
redis_host: Option<String>,
|
||||
/// redis port for streaming connections (might be different from the notifications host)
|
||||
/// redis port for irsa authentication
|
||||
#[clap(long)]
|
||||
redis_port: Option<u16>,
|
||||
/// redis cluster name, used in aws elasticache
|
||||
/// redis cluster name for irsa authentication
|
||||
#[clap(long)]
|
||||
redis_cluster_name: Option<String>,
|
||||
/// redis user_id, used in aws elasticache
|
||||
/// redis user_id for irsa authentication
|
||||
#[clap(long)]
|
||||
redis_user_id: Option<String>,
|
||||
/// aws region to retrieve credentials
|
||||
/// aws region for irsa authentication
|
||||
#[clap(long, default_value_t = String::new())]
|
||||
aws_region: String,
|
||||
/// cache for `project_info` (use `size=0` to disable)
|
||||
@@ -200,6 +185,12 @@ struct ProxyCliArgs {
|
||||
#[clap(flatten)]
|
||||
parquet_upload: ParquetUploadArgs,
|
||||
|
||||
/// http endpoint to receive periodic metric updates
|
||||
#[clap(long)]
|
||||
metric_collection_endpoint: Option<String>,
|
||||
/// how often metrics should be sent to a collection endpoint
|
||||
#[clap(long)]
|
||||
metric_collection_interval: Option<String>,
|
||||
/// interval for backup metric collection
|
||||
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
|
||||
metric_backup_collection_interval: std::time::Duration,
|
||||
@@ -212,6 +203,7 @@ struct ProxyCliArgs {
|
||||
/// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
|
||||
#[clap(long, default_value = "4194304")]
|
||||
metric_backup_collection_chunk_size: usize,
|
||||
|
||||
/// Whether to retry the connection to the compute node
|
||||
#[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
|
||||
connect_to_compute_retry: String,
|
||||
@@ -331,7 +323,7 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
|
||||
}
|
||||
info!("Using region: {}", args.aws_region);
|
||||
let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?;
|
||||
let redis_client = configure_redis(&args).await?;
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
info!("Starting http on {}", args.http);
|
||||
@@ -386,13 +378,6 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
|
||||
RateBucketInfo::validate(redis_rps_limit)?;
|
||||
|
||||
let redis_kv_client = regional_redis_client
|
||||
.as_ref()
|
||||
.map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit));
|
||||
|
||||
let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute));
|
||||
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
|
||||
@@ -407,7 +392,7 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
match auth_backend {
|
||||
Either::Left(auth_backend) => {
|
||||
if let Some(proxy_listener) = proxy_listener {
|
||||
client_tasks.spawn(crate::proxy::task_main(
|
||||
client_tasks.spawn(crate::pglb::task_main(
|
||||
config,
|
||||
auth_backend,
|
||||
proxy_listener,
|
||||
@@ -472,6 +457,7 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
client_tasks.spawn(crate::context::parquet::worker(
|
||||
cancellation_token.clone(),
|
||||
args.parquet_upload,
|
||||
args.region,
|
||||
));
|
||||
|
||||
// maintenance tasks. these never return unless there's an error
|
||||
@@ -495,32 +481,17 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
#[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))]
|
||||
if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend {
|
||||
if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
|
||||
match (redis_notifications_client, regional_redis_client.clone()) {
|
||||
(None, None) => {}
|
||||
(client1, client2) => {
|
||||
let cache = api.caches.project_info.clone();
|
||||
if let Some(client) = client1 {
|
||||
maintenance_tasks.spawn(notifications::task_main(
|
||||
client,
|
||||
cache.clone(),
|
||||
args.region.clone(),
|
||||
));
|
||||
}
|
||||
if let Some(client) = client2 {
|
||||
maintenance_tasks.spawn(notifications::task_main(
|
||||
client,
|
||||
cache.clone(),
|
||||
args.region.clone(),
|
||||
));
|
||||
}
|
||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||
}
|
||||
}
|
||||
if let Some(client) = redis_client {
|
||||
// project info cache and invalidation of that cache.
|
||||
let cache = api.caches.project_info.clone();
|
||||
maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone()));
|
||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||
|
||||
// Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
|
||||
// This prevents immediate exit and pod restart,
|
||||
// which can cause hammering of the redis in case of connection issues.
|
||||
if let Some(mut redis_kv_client) = redis_kv_client {
|
||||
// Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
|
||||
// This prevents immediate exit and pod restart,
|
||||
// which can cause hammering of the redis in case of connection issues.
|
||||
// cancellation key management
|
||||
let mut redis_kv_client = RedisKVClient::new(client.clone());
|
||||
for attempt in (0..3).with_position() {
|
||||
match redis_kv_client.try_connect().await {
|
||||
Ok(()) => {
|
||||
@@ -545,14 +516,12 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(regional_redis_client) = regional_redis_client {
|
||||
// listen for notifications of new projects/endpoints/branches
|
||||
let cache = api.caches.endpoints_cache.clone();
|
||||
let con = regional_redis_client;
|
||||
let span = tracing::info_span!("endpoints_cache");
|
||||
maintenance_tasks.spawn(
|
||||
async move { cache.do_read(con, cancellation_token.clone()).await }
|
||||
async move { cache.do_read(client, cancellation_token.clone()).await }
|
||||
.instrument(span),
|
||||
);
|
||||
}
|
||||
@@ -681,7 +650,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
authentication_config,
|
||||
proxy_protocol_v2: args.proxy_protocol_v2,
|
||||
handshake_timeout: args.handshake_timeout,
|
||||
region: args.region.clone(),
|
||||
wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
|
||||
connect_compute_locks,
|
||||
connect_to_compute: compute_config,
|
||||
@@ -843,21 +811,18 @@ fn build_auth_backend(
|
||||
|
||||
async fn configure_redis(
|
||||
args: &ProxyCliArgs,
|
||||
) -> anyhow::Result<(
|
||||
Option<ConnectionWithCredentialsProvider>,
|
||||
Option<ConnectionWithCredentialsProvider>,
|
||||
)> {
|
||||
) -> anyhow::Result<Option<ConnectionWithCredentialsProvider>> {
|
||||
// TODO: untangle the config args
|
||||
let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
|
||||
("plain", redis_url) => match redis_url {
|
||||
let redis_client = match &*args.redis_auth_type {
|
||||
"plain" => match &args.redis_plain {
|
||||
None => {
|
||||
bail!("plain auth requires redis_notifications to be set");
|
||||
bail!("plain auth requires redis_plain to be set");
|
||||
}
|
||||
Some(url) => {
|
||||
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone()))
|
||||
}
|
||||
},
|
||||
("irsa", _) => match (&args.redis_host, args.redis_port) {
|
||||
"irsa" => match (&args.redis_host, args.redis_port) {
|
||||
(Some(host), Some(port)) => Some(
|
||||
ConnectionWithCredentialsProvider::new_with_credentials_provider(
|
||||
host.clone(),
|
||||
@@ -881,18 +846,12 @@ async fn configure_redis(
|
||||
bail!("redis-host and redis-port must be specified together");
|
||||
}
|
||||
},
|
||||
_ => {
|
||||
bail!("unknown auth type given");
|
||||
auth_type => {
|
||||
bail!("unknown auth type {auth_type:?} given")
|
||||
}
|
||||
};
|
||||
|
||||
let redis_notifications_client = if let Some(url) = &args.redis_notifications {
|
||||
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url))
|
||||
} else {
|
||||
regional_redis_client.clone()
|
||||
};
|
||||
|
||||
Ok((regional_redis_client, redis_notifications_client))
|
||||
Ok(redis_client)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
63
proxy/src/cache/timed_lru.rs
vendored
63
proxy/src/cache/timed_lru.rs
vendored
@@ -30,7 +30,7 @@ use super::{Cache, timed_lru};
|
||||
///
|
||||
/// * There's an API for immediate invalidation (removal) of a cache entry;
|
||||
/// It's useful in case we know for sure that the entry is no longer correct.
|
||||
/// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information.
|
||||
/// See [`timed_lru::Cached`] for more information.
|
||||
///
|
||||
/// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
|
||||
/// or by a successful lookup (i.e. the entry hasn't expired yet).
|
||||
@@ -54,7 +54,7 @@ pub(crate) struct TimedLru<K, V> {
|
||||
impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
|
||||
type Key = K;
|
||||
type Value = V;
|
||||
type LookupInfo<Key> = LookupInfo<Key>;
|
||||
type LookupInfo<Key> = Key;
|
||||
|
||||
fn invalidate(&self, info: &Self::LookupInfo<K>) {
|
||||
self.invalidate_raw(info);
|
||||
@@ -87,30 +87,24 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {
|
||||
|
||||
/// Drop an entry from the cache if it's outdated.
|
||||
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
|
||||
fn invalidate_raw(&self, info: &LookupInfo<K>) {
|
||||
let now = Instant::now();
|
||||
|
||||
fn invalidate_raw(&self, key: &K) {
|
||||
// Do costly things before taking the lock.
|
||||
let mut cache = self.cache.lock();
|
||||
let raw_entry = match cache.raw_entry_mut().from_key(&info.key) {
|
||||
let entry = match cache.raw_entry_mut().from_key(key) {
|
||||
RawEntryMut::Vacant(_) => return,
|
||||
RawEntryMut::Occupied(x) => x,
|
||||
RawEntryMut::Occupied(x) => x.remove(),
|
||||
};
|
||||
|
||||
// Remove the entry if it was created prior to lookup timestamp.
|
||||
let entry = raw_entry.get();
|
||||
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
|
||||
let should_remove = created_at <= info.created_at || expires_at <= now;
|
||||
|
||||
if should_remove {
|
||||
raw_entry.remove();
|
||||
}
|
||||
|
||||
drop(cache); // drop lock before logging
|
||||
|
||||
let Entry {
|
||||
created_at,
|
||||
expires_at,
|
||||
..
|
||||
} = entry;
|
||||
|
||||
debug!(
|
||||
created_at = format_args!("{created_at:?}"),
|
||||
expires_at = format_args!("{expires_at:?}"),
|
||||
entry_removed = should_remove,
|
||||
?created_at,
|
||||
?expires_at,
|
||||
"processed a cache entry invalidation event"
|
||||
);
|
||||
}
|
||||
@@ -211,10 +205,10 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
|
||||
}
|
||||
|
||||
pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option<V>, Cached<&Self, ()>) {
|
||||
let (created_at, old) = self.insert_raw(key.clone(), value);
|
||||
let (_, old) = self.insert_raw(key.clone(), value);
|
||||
|
||||
let cached = Cached {
|
||||
token: Some((self, LookupInfo { created_at, key })),
|
||||
token: Some((self, key)),
|
||||
value: (),
|
||||
};
|
||||
|
||||
@@ -229,28 +223,9 @@ impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
|
||||
K: Borrow<Q> + Clone,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
self.get_raw(key, |key, entry| {
|
||||
let info = LookupInfo {
|
||||
created_at: entry.created_at,
|
||||
key: key.clone(),
|
||||
};
|
||||
|
||||
Cached {
|
||||
token: Some((self, info)),
|
||||
value: entry.value.clone(),
|
||||
}
|
||||
self.get_raw(key, |key, entry| Cached {
|
||||
token: Some((self, key.clone())),
|
||||
value: entry.value.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Lookup information for key invalidation.
|
||||
pub(crate) struct LookupInfo<K> {
|
||||
/// Time of creation of a cache [`Entry`].
|
||||
/// We use this during invalidation lookups to prevent eviction of a newer
|
||||
/// entry sharing the same key (it might've been inserted by a different
|
||||
/// task after we got the entry we're trying to invalidate now).
|
||||
created_at: Instant,
|
||||
|
||||
/// Search by this key.
|
||||
key: K,
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
use std::pin::pin;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -98,7 +99,6 @@ impl Pipeline {
|
||||
|
||||
impl CancelKeyOp {
|
||||
fn register(&self, pipe: &mut Pipeline) {
|
||||
#[allow(clippy::used_underscore_binding)]
|
||||
match self {
|
||||
CancelKeyOp::StoreCancelKey { key, value, expire } => {
|
||||
let key = KeyPrefix::Cancel(*key).build_redis_key();
|
||||
@@ -224,6 +224,7 @@ impl CancellationHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// This is not cancel safe
|
||||
async fn get_cancel_key(
|
||||
&self,
|
||||
key: CancelKeyData,
|
||||
@@ -240,16 +241,21 @@ impl CancellationHandler {
|
||||
};
|
||||
|
||||
const TIMEOUT: Duration = Duration::from_secs(5);
|
||||
let result = timeout(TIMEOUT, tx.call((guard, op)))
|
||||
.await
|
||||
.map_err(|_| {
|
||||
tracing::warn!("timed out waiting to receive GetCancelData response");
|
||||
CancelError::RateLimit
|
||||
})?
|
||||
.map_err(|e| {
|
||||
tracing::warn!("failed to receive GetCancelData response: {e}");
|
||||
CancelError::InternalError
|
||||
})?;
|
||||
let result = timeout(
|
||||
TIMEOUT,
|
||||
tx.call((guard, op), std::future::pending::<Infallible>()),
|
||||
)
|
||||
.await
|
||||
.map_err(|_| {
|
||||
tracing::warn!("timed out waiting to receive GetCancelData response");
|
||||
CancelError::RateLimit
|
||||
})?
|
||||
// cannot be cancelled
|
||||
.unwrap_or_else(|x| match x {})
|
||||
.map_err(|e| {
|
||||
tracing::warn!("failed to receive GetCancelData response: {e}");
|
||||
CancelError::InternalError
|
||||
})?;
|
||||
|
||||
let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| {
|
||||
tracing::warn!("failed to receive GetCancelData response: {e}");
|
||||
@@ -271,6 +277,8 @@ impl CancellationHandler {
|
||||
/// Will fetch IP allowlist internally.
|
||||
///
|
||||
/// return Result primarily for tests
|
||||
///
|
||||
/// This is not cancel safe
|
||||
pub(crate) async fn cancel_session<T: ControlPlaneApi>(
|
||||
&self,
|
||||
key: CancelKeyData,
|
||||
@@ -394,6 +402,8 @@ impl Session {
|
||||
|
||||
/// Ensure the cancel key is continously refreshed,
|
||||
/// but stop when the channel is dropped.
|
||||
///
|
||||
/// This is not cancel safe
|
||||
pub(crate) async fn maintain_cancel_key(
|
||||
&self,
|
||||
session_id: uuid::Uuid,
|
||||
@@ -401,27 +411,6 @@ impl Session {
|
||||
cancel_closure: &CancelClosure,
|
||||
compute_config: &ComputeConfig,
|
||||
) {
|
||||
futures::future::select(
|
||||
std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)),
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Err(err) = cancel_closure
|
||||
.try_cancel_query(compute_config)
|
||||
.boxed()
|
||||
.await
|
||||
{
|
||||
tracing::warn!(
|
||||
?session_id,
|
||||
?err,
|
||||
"could not cancel the query in the database"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure the cancel key is continously refreshed.
|
||||
async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! {
|
||||
let Some(tx) = self.cancellation_handler.tx.get() else {
|
||||
tracing::warn!("cancellation handler is not available");
|
||||
// don't exit, as we only want to exit if cancelled externally.
|
||||
@@ -432,6 +421,8 @@ impl Session {
|
||||
.expect("serialising to json string should not fail")
|
||||
.into_boxed_str();
|
||||
|
||||
let mut cancel = pin!(cancel);
|
||||
|
||||
loop {
|
||||
let guard = Metrics::get()
|
||||
.proxy
|
||||
@@ -449,9 +440,35 @@ impl Session {
|
||||
"registering cancellation key"
|
||||
);
|
||||
|
||||
if tx.call((guard, op)).await.is_ok() {
|
||||
tokio::time::sleep(CANCEL_KEY_REFRESH).await;
|
||||
match tx.call((guard, op), cancel.as_mut()).await {
|
||||
Ok(Ok(_)) => {
|
||||
tracing::debug!(
|
||||
src=%self.key,
|
||||
dest=?cancel_closure.cancel_token,
|
||||
"registered cancellation key"
|
||||
);
|
||||
|
||||
// wait before continuing.
|
||||
tokio::time::sleep(CANCEL_KEY_REFRESH).await;
|
||||
}
|
||||
// retry immediately.
|
||||
Ok(Err(error)) => {
|
||||
tracing::warn!(?error, "error registering cancellation key");
|
||||
}
|
||||
Err(Err(_cancelled)) => break,
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(err) = cancel_closure
|
||||
.try_cancel_query(compute_config)
|
||||
.boxed()
|
||||
.await
|
||||
{
|
||||
tracing::warn!(
|
||||
?session_id,
|
||||
?err,
|
||||
"could not cancel the query in the database"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::net::{IpAddr, SocketAddr};
|
||||
|
||||
use futures::{FutureExt, TryFutureExt};
|
||||
use itertools::Itertools;
|
||||
use postgres_client::config::{AuthKeys, SslMode};
|
||||
use postgres_client::config::{AuthKeys, ChannelBinding, SslMode};
|
||||
use postgres_client::maybe_tls_stream::MaybeTlsStream;
|
||||
use postgres_client::tls::MakeTlsConnect;
|
||||
use postgres_client::{NoTls, RawCancelToken, RawConnection};
|
||||
@@ -129,6 +129,8 @@ pub(crate) struct AuthInfo {
|
||||
auth: Option<Auth>,
|
||||
server_params: StartupMessageParams,
|
||||
|
||||
channel_binding: ChannelBinding,
|
||||
|
||||
/// Console redirect sets user and database, we shouldn't re-use those from the params.
|
||||
skip_db_user: bool,
|
||||
}
|
||||
@@ -152,6 +154,8 @@ impl AuthInfo {
|
||||
auth: pw.map(|pw| Auth::Password(pw.as_bytes().to_owned())),
|
||||
server_params,
|
||||
skip_db_user: true,
|
||||
// pg-sni-router is a mitm so this would fail.
|
||||
channel_binding: ChannelBinding::Disable,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -165,6 +169,7 @@ impl AuthInfo {
|
||||
},
|
||||
server_params: StartupMessageParams::default(),
|
||||
skip_db_user: false,
|
||||
channel_binding: ChannelBinding::Prefer,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -187,6 +192,7 @@ impl AuthInfo {
|
||||
Some(Auth::Password(pw)) => config.password(pw),
|
||||
None => &mut config,
|
||||
};
|
||||
config.channel_binding(self.channel_binding);
|
||||
for (k, v) in self.server_params.iter() {
|
||||
config.set_param(k, v);
|
||||
}
|
||||
@@ -230,7 +236,7 @@ impl AuthInfo {
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
compute: &mut ComputeConnection,
|
||||
user_info: ComputeUserInfo,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<PostgresSettings, PostgresError> {
|
||||
// client config with stubbed connect info.
|
||||
// TODO(conrad): should we rewrite this to bypass tokio-postgres2 entirely,
|
||||
@@ -241,7 +247,9 @@ impl AuthInfo {
|
||||
let tmp_config = self.enrich(tmp_config);
|
||||
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let connection = tmp_config.connect_raw(&mut compute.stream, NoTls).await?;
|
||||
let connection = tmp_config
|
||||
.tls_and_authenticate(&mut compute.stream, NoTls)
|
||||
.await?;
|
||||
drop(pause);
|
||||
|
||||
let RawConnection {
|
||||
@@ -264,7 +272,7 @@ impl AuthInfo {
|
||||
secret_key,
|
||||
},
|
||||
compute.hostname.to_string(),
|
||||
user_info,
|
||||
user_info.clone(),
|
||||
);
|
||||
|
||||
Ok(PostgresSettings {
|
||||
|
||||
@@ -22,7 +22,6 @@ pub struct ProxyConfig {
|
||||
pub http_config: HttpConfig,
|
||||
pub authentication_config: AuthenticationConfig,
|
||||
pub proxy_protocol_v2: ProxyProtocolV2,
|
||||
pub region: String,
|
||||
pub handshake_timeout: Duration,
|
||||
pub wake_compute_retry_config: RetryConfig,
|
||||
pub connect_compute_locks: ApiLocks<Host>,
|
||||
|
||||
@@ -11,11 +11,12 @@ use crate::config::{ProxyConfig, ProxyProtocolV2};
|
||||
use crate::context::RequestContext;
|
||||
use crate::error::ReportableError;
|
||||
use crate::metrics::{Metrics, NumClientConnectionsGuard};
|
||||
use crate::pglb::ClientRequestError;
|
||||
use crate::pglb::handshake::{HandshakeData, handshake};
|
||||
use crate::pglb::passthrough::ProxyPassthrough;
|
||||
use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol};
|
||||
use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute};
|
||||
use crate::proxy::{ClientRequestError, ErrorSource, prepare_client_connection};
|
||||
use crate::proxy::{ErrorSource, finish_client_init};
|
||||
use crate::util::run_until_cancelled;
|
||||
|
||||
pub async fn task_main(
|
||||
@@ -89,12 +90,7 @@ pub async fn task_main(
|
||||
}
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(
|
||||
session_id,
|
||||
conn_info,
|
||||
crate::metrics::Protocol::Tcp,
|
||||
&config.region,
|
||||
);
|
||||
let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp);
|
||||
|
||||
let res = handle_client(
|
||||
config,
|
||||
@@ -231,13 +227,13 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
.await?;
|
||||
|
||||
let pg_settings = auth_info
|
||||
.authenticate(ctx, &mut node, user_info)
|
||||
.authenticate(ctx, &mut node, &user_info)
|
||||
.or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) })
|
||||
.await?;
|
||||
|
||||
let session = cancellation_handler.get_key();
|
||||
|
||||
prepare_client_connection(&pg_settings, *session.key(), &mut stream);
|
||||
finish_client_init(&pg_settings, *session.key(), &mut stream);
|
||||
let stream = stream.flush_and_into_inner().await?;
|
||||
|
||||
let session_id = ctx.session_id();
|
||||
|
||||
@@ -46,7 +46,6 @@ struct RequestContextInner {
|
||||
pub(crate) session_id: Uuid,
|
||||
pub(crate) protocol: Protocol,
|
||||
first_packet: chrono::DateTime<Utc>,
|
||||
region: &'static str,
|
||||
pub(crate) span: Span,
|
||||
|
||||
// filled in as they are discovered
|
||||
@@ -94,7 +93,6 @@ impl Clone for RequestContext {
|
||||
session_id: inner.session_id,
|
||||
protocol: inner.protocol,
|
||||
first_packet: inner.first_packet,
|
||||
region: inner.region,
|
||||
span: info_span!("background_task"),
|
||||
|
||||
project: inner.project,
|
||||
@@ -124,12 +122,7 @@ impl Clone for RequestContext {
|
||||
}
|
||||
|
||||
impl RequestContext {
|
||||
pub fn new(
|
||||
session_id: Uuid,
|
||||
conn_info: ConnectionInfo,
|
||||
protocol: Protocol,
|
||||
region: &'static str,
|
||||
) -> Self {
|
||||
pub fn new(session_id: Uuid, conn_info: ConnectionInfo, protocol: Protocol) -> Self {
|
||||
// TODO: be careful with long lived spans
|
||||
let span = info_span!(
|
||||
"connect_request",
|
||||
@@ -145,7 +138,6 @@ impl RequestContext {
|
||||
session_id,
|
||||
protocol,
|
||||
first_packet: Utc::now(),
|
||||
region,
|
||||
span,
|
||||
|
||||
project: None,
|
||||
@@ -179,7 +171,7 @@ impl RequestContext {
|
||||
let ip = IpAddr::from([127, 0, 0, 1]);
|
||||
let addr = SocketAddr::new(ip, 5432);
|
||||
let conn_info = ConnectionInfo { addr, extra: None };
|
||||
RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test")
|
||||
RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp)
|
||||
}
|
||||
|
||||
pub(crate) fn console_application_name(&self) -> String {
|
||||
|
||||
@@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
||||
|
||||
#[derive(parquet_derive::ParquetRecordWriter)]
|
||||
pub(crate) struct RequestData {
|
||||
region: &'static str,
|
||||
region: String,
|
||||
protocol: &'static str,
|
||||
/// Must be UTC. The derive macro doesn't like the timezones
|
||||
timestamp: chrono::NaiveDateTime,
|
||||
@@ -147,7 +147,7 @@ impl From<&RequestContextInner> for RequestData {
|
||||
}),
|
||||
jwt_issuer: value.jwt_issuer.clone(),
|
||||
protocol: value.protocol.as_str(),
|
||||
region: value.region,
|
||||
region: String::new(),
|
||||
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
|
||||
success: value.success,
|
||||
cold_start_info: value.cold_start_info.as_str(),
|
||||
@@ -167,6 +167,7 @@ impl From<&RequestContextInner> for RequestData {
|
||||
pub async fn worker(
|
||||
cancellation_token: CancellationToken,
|
||||
config: ParquetUploadArgs,
|
||||
region: String,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(remote_storage_config) = config.parquet_upload_remote_storage else {
|
||||
tracing::warn!("parquet request upload: no s3 bucket configured");
|
||||
@@ -232,12 +233,17 @@ pub async fn worker(
|
||||
.context("remote storage for disconnect events init")?;
|
||||
let parquet_config_disconnect = parquet_config.clone();
|
||||
tokio::try_join!(
|
||||
worker_inner(storage, rx, parquet_config),
|
||||
worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect)
|
||||
worker_inner(storage, rx, parquet_config, ®ion),
|
||||
worker_inner(
|
||||
storage_disconnect,
|
||||
rx_disconnect,
|
||||
parquet_config_disconnect,
|
||||
®ion
|
||||
)
|
||||
)
|
||||
.map(|_| ())
|
||||
} else {
|
||||
worker_inner(storage, rx, parquet_config).await
|
||||
worker_inner(storage, rx, parquet_config, ®ion).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,6 +263,7 @@ async fn worker_inner(
|
||||
storage: GenericRemoteStorage,
|
||||
rx: impl Stream<Item = RequestData>,
|
||||
config: ParquetConfig,
|
||||
region: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
let storage = if config.test_remote_failures > 0 {
|
||||
@@ -277,7 +284,8 @@ async fn worker_inner(
|
||||
let mut last_upload = time::Instant::now();
|
||||
|
||||
let mut len = 0;
|
||||
while let Some(row) = rx.next().await {
|
||||
while let Some(mut row) = rx.next().await {
|
||||
region.clone_into(&mut row.region);
|
||||
rows.push(row);
|
||||
let force = last_upload.elapsed() > config.max_duration;
|
||||
if rows.len() == config.rows_per_group || force {
|
||||
@@ -533,7 +541,7 @@ mod tests {
|
||||
auth_method: None,
|
||||
jwt_issuer: None,
|
||||
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
|
||||
region: "us-east-1",
|
||||
region: String::new(),
|
||||
error: None,
|
||||
success: rng.r#gen(),
|
||||
cold_start_info: "no",
|
||||
@@ -565,7 +573,9 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
worker_inner(storage, rx, config).await.unwrap();
|
||||
worker_inner(storage, rx, config, "us-east-1")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut files = WalkDir::new(tmpdir.as_std_path())
|
||||
.into_iter()
|
||||
|
||||
@@ -8,10 +8,10 @@ use crate::config::TlsConfig;
|
||||
use crate::context::RequestContext;
|
||||
use crate::error::ReportableError;
|
||||
use crate::metrics::Metrics;
|
||||
use crate::pglb::TlsRequired;
|
||||
use crate::pqproto::{
|
||||
BeMessage, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams,
|
||||
};
|
||||
use crate::proxy::TlsRequired;
|
||||
use crate::stream::{PqStream, Stream, StreamUpgradeError};
|
||||
use crate::tls::PG_ALPN_PROTOCOL;
|
||||
|
||||
|
||||
@@ -2,3 +2,332 @@ pub mod copy_bidirectional;
|
||||
pub mod handshake;
|
||||
pub mod inprocess;
|
||||
pub mod passthrough;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::FutureExt;
|
||||
use smol_str::ToSmolStr;
|
||||
use thiserror::Error;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, debug, error, info, warn};
|
||||
|
||||
use crate::auth;
|
||||
use crate::cancellation::{self, CancellationHandler};
|
||||
use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
|
||||
use crate::context::RequestContext;
|
||||
use crate::error::{ReportableError, UserFacingError};
|
||||
use crate::metrics::{Metrics, NumClientConnectionsGuard};
|
||||
pub use crate::pglb::copy_bidirectional::ErrorSource;
|
||||
use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake};
|
||||
use crate::pglb::passthrough::ProxyPassthrough;
|
||||
use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol};
|
||||
use crate::proxy::handle_client;
|
||||
use crate::rate_limiter::EndpointRateLimiter;
|
||||
use crate::stream::Stream;
|
||||
use crate::util::run_until_cancelled;
|
||||
|
||||
pub const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("{ERR_INSECURE_CONNECTION}")]
|
||||
pub struct TlsRequired;
|
||||
|
||||
impl ReportableError for TlsRequired {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for TlsRequired {}
|
||||
|
||||
pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
auth_backend: &'static auth::Backend<'static, ()>,
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("proxy has shut down");
|
||||
}
|
||||
|
||||
// When set for the server socket, the keepalive setting
|
||||
// will be inherited by all accepted client sockets.
|
||||
socket2::SockRef::from(&listener).set_keepalive(true)?;
|
||||
|
||||
let connections = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
let cancellations = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
|
||||
while let Some(accept_result) =
|
||||
run_until_cancelled(listener.accept(), &cancellation_token).await
|
||||
{
|
||||
let (socket, peer_addr) = accept_result?;
|
||||
|
||||
let conn_gauge = Metrics::get()
|
||||
.proxy
|
||||
.client_connections
|
||||
.guard(crate::metrics::Protocol::Tcp);
|
||||
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let cancellation_handler = Arc::clone(&cancellation_handler);
|
||||
let cancellations = cancellations.clone();
|
||||
|
||||
debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
|
||||
let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
|
||||
|
||||
connections.spawn(async move {
|
||||
let (socket, conn_info) = match config.proxy_protocol_v2 {
|
||||
ProxyProtocolV2::Required => {
|
||||
match read_proxy_protocol(socket).await {
|
||||
Err(e) => {
|
||||
warn!("per-client task finished with an error: {e:#}");
|
||||
return;
|
||||
}
|
||||
// our load balancers will not send any more data. let's just exit immediately
|
||||
Ok((_socket, ConnectHeader::Local)) => {
|
||||
debug!("healthcheck received");
|
||||
return;
|
||||
}
|
||||
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
|
||||
}
|
||||
}
|
||||
// ignore the header - it cannot be confused for a postgres or http connection so will
|
||||
// error later.
|
||||
ProxyProtocolV2::Rejected => (
|
||||
socket,
|
||||
ConnectionInfo {
|
||||
addr: peer_addr,
|
||||
extra: None,
|
||||
},
|
||||
),
|
||||
};
|
||||
|
||||
match socket.set_nodelay(true) {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"per-client task finished with an error: failed to set socket option: {e:#}"
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp);
|
||||
|
||||
let res = handle_connection(
|
||||
config,
|
||||
auth_backend,
|
||||
&ctx,
|
||||
cancellation_handler,
|
||||
socket,
|
||||
ClientMode::Tcp,
|
||||
endpoint_rate_limiter2,
|
||||
conn_gauge,
|
||||
cancellations,
|
||||
)
|
||||
.instrument(ctx.span())
|
||||
.boxed()
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Err(e) => {
|
||||
ctx.set_error_kind(e.get_error_kind());
|
||||
warn!(parent: &ctx.span(), "per-client task finished with an error: {e:#}");
|
||||
}
|
||||
Ok(None) => {
|
||||
ctx.set_success();
|
||||
}
|
||||
Ok(Some(p)) => {
|
||||
ctx.set_success();
|
||||
let _disconnect = ctx.log_connect();
|
||||
match p.proxy_pass().await {
|
||||
Ok(()) => {}
|
||||
Err(ErrorSource::Client(e)) => {
|
||||
warn!(
|
||||
?session_id,
|
||||
"per-client task finished with an IO error from the client: {e:#}"
|
||||
);
|
||||
}
|
||||
Err(ErrorSource::Compute(e)) => {
|
||||
error!(
|
||||
?session_id,
|
||||
"per-client task finished with an IO error from the compute: {e:#}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
connections.close();
|
||||
cancellations.close();
|
||||
drop(listener);
|
||||
|
||||
// Drain connections
|
||||
connections.wait().await;
|
||||
cancellations.wait().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) enum ClientMode {
|
||||
Tcp,
|
||||
Websockets { hostname: Option<String> },
|
||||
}
|
||||
|
||||
/// Abstracts the logic of handling TCP vs WS clients
|
||||
impl ClientMode {
|
||||
pub fn allow_cleartext(&self) -> bool {
|
||||
match self {
|
||||
ClientMode::Tcp => false,
|
||||
ClientMode::Websockets { .. } => true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn hostname<'a, S>(&'a self, s: &'a Stream<S>) -> Option<&'a str> {
|
||||
match self {
|
||||
ClientMode::Tcp => s.sni_hostname(),
|
||||
ClientMode::Websockets { hostname } => hostname.as_deref(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn handshake_tls<'a>(&self, tls: Option<&'a TlsConfig>) -> Option<&'a TlsConfig> {
|
||||
match self {
|
||||
ClientMode::Tcp => tls,
|
||||
// TLS is None here if using websockets, because the connection is already encrypted.
|
||||
ClientMode::Websockets { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
// almost all errors should be reported to the user, but there's a few cases where we cannot
|
||||
// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons
|
||||
// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation,
|
||||
// we cannot be sure the client even understands our error message
|
||||
// 3. PrepareClient: The client disconnected, so we can't tell them anyway...
|
||||
pub(crate) enum ClientRequestError {
|
||||
#[error("{0}")]
|
||||
Cancellation(#[from] cancellation::CancelError),
|
||||
#[error("{0}")]
|
||||
Handshake(#[from] HandshakeError),
|
||||
#[error("{0}")]
|
||||
HandshakeTimeout(#[from] tokio::time::error::Elapsed),
|
||||
#[error("{0}")]
|
||||
PrepareClient(#[from] std::io::Error),
|
||||
#[error("{0}")]
|
||||
ReportedError(#[from] crate::stream::ReportedError),
|
||||
}
|
||||
|
||||
impl ReportableError for ClientRequestError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
ClientRequestError::Cancellation(e) => e.get_error_kind(),
|
||||
ClientRequestError::Handshake(e) => e.get_error_kind(),
|
||||
ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit,
|
||||
ClientRequestError::ReportedError(e) => e.get_error_kind(),
|
||||
ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn handle_connection<S: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
config: &'static ProxyConfig,
|
||||
auth_backend: &'static auth::Backend<'static, ()>,
|
||||
ctx: &RequestContext,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
client: S,
|
||||
mode: ClientMode,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
conn_gauge: NumClientConnectionsGuard<'static>,
|
||||
cancellations: tokio_util::task::task_tracker::TaskTracker,
|
||||
) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
|
||||
debug!(
|
||||
protocol = %ctx.protocol(),
|
||||
"handling interactive connection from client"
|
||||
);
|
||||
|
||||
let metrics = &Metrics::get().proxy;
|
||||
let proto = ctx.protocol();
|
||||
let request_gauge = metrics.connection_requests.guard(proto);
|
||||
|
||||
let tls = config.tls_config.load();
|
||||
let tls = tls.as_deref();
|
||||
|
||||
let record_handshake_error = !ctx.has_private_peer_addr();
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(ctx, client, mode.handshake_tls(tls), record_handshake_error);
|
||||
|
||||
let (mut client, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
|
||||
.await??
|
||||
{
|
||||
HandshakeData::Startup(client, params) => (client, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let ctx = ctx.clone();
|
||||
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id());
|
||||
cancel_span.follows_from(tracing::Span::current());
|
||||
async move {
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
ctx,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
config.authentication_config.is_vpc_acccess_proxy,
|
||||
auth_backend.get_api(),
|
||||
)
|
||||
.await
|
||||
.inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok();
|
||||
}.instrument(cancel_span)
|
||||
});
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
drop(pause);
|
||||
|
||||
ctx.set_db_options(params.clone());
|
||||
|
||||
let common_names = tls.map(|tls| &tls.common_names);
|
||||
|
||||
let (node, cancel_on_shutdown) = handle_client(
|
||||
config,
|
||||
auth_backend,
|
||||
ctx,
|
||||
cancellation_handler,
|
||||
&mut client,
|
||||
&mode,
|
||||
endpoint_rate_limiter,
|
||||
common_names,
|
||||
¶ms,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let client = client.flush_and_into_inner().await?;
|
||||
|
||||
let private_link_id = match ctx.extra() {
|
||||
Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()),
|
||||
Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()),
|
||||
None => None,
|
||||
};
|
||||
|
||||
Ok(Some(ProxyPassthrough {
|
||||
client,
|
||||
compute: node.stream,
|
||||
|
||||
aux: node.aux,
|
||||
private_link_id,
|
||||
|
||||
_cancel_on_shutdown: cancel_on_shutdown,
|
||||
|
||||
_req: request_gauge,
|
||||
_conn: conn_gauge,
|
||||
_db_conn: node.guage,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -112,7 +112,7 @@ where
|
||||
let node_info = if !node_info.cached() || !err.should_retry_wake_compute() {
|
||||
// If we just recieved this from cplane and didn't get it from cache, we shouldn't retry.
|
||||
// Do not need to retrieve a new node_info, just return the old one.
|
||||
if should_retry(&err, num_retries, compute.retry) {
|
||||
if !should_retry(&err, num_retries, compute.retry) {
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Failed,
|
||||
|
||||
@@ -5,328 +5,64 @@ pub(crate) mod connect_compute;
|
||||
pub(crate) mod retry;
|
||||
pub(crate) mod wake_compute;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::convert::Infallible;
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use smol_str::{SmolStr, ToSmolStr, format_smolstr};
|
||||
use thiserror::Error;
|
||||
use smol_str::{SmolStr, format_smolstr};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, debug, error, info, warn};
|
||||
use tokio::sync::oneshot;
|
||||
use tracing::Instrument;
|
||||
|
||||
use crate::cancellation::{self, CancellationHandler};
|
||||
use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
|
||||
use crate::cache::Cache;
|
||||
use crate::cancellation::CancellationHandler;
|
||||
use crate::compute::ComputeConnection;
|
||||
use crate::config::ProxyConfig;
|
||||
use crate::context::RequestContext;
|
||||
use crate::error::{ReportableError, UserFacingError};
|
||||
use crate::metrics::{Metrics, NumClientConnectionsGuard};
|
||||
use crate::control_plane::client::ControlPlaneClient;
|
||||
pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute};
|
||||
use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake};
|
||||
use crate::pglb::passthrough::ProxyPassthrough;
|
||||
use crate::pglb::{ClientMode, ClientRequestError};
|
||||
use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams};
|
||||
use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol};
|
||||
use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute};
|
||||
use crate::proxy::retry::ShouldRetryWakeCompute;
|
||||
use crate::rate_limiter::EndpointRateLimiter;
|
||||
use crate::stream::{PqStream, Stream};
|
||||
use crate::types::EndpointCacheKey;
|
||||
use crate::util::run_until_cancelled;
|
||||
use crate::{auth, compute};
|
||||
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("{ERR_INSECURE_CONNECTION}")]
|
||||
pub struct TlsRequired;
|
||||
|
||||
impl ReportableError for TlsRequired {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for TlsRequired {}
|
||||
|
||||
pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
auth_backend: &'static auth::Backend<'static, ()>,
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("proxy has shut down");
|
||||
}
|
||||
|
||||
// When set for the server socket, the keepalive setting
|
||||
// will be inherited by all accepted client sockets.
|
||||
socket2::SockRef::from(&listener).set_keepalive(true)?;
|
||||
|
||||
let connections = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
let cancellations = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
|
||||
while let Some(accept_result) =
|
||||
run_until_cancelled(listener.accept(), &cancellation_token).await
|
||||
{
|
||||
let (socket, peer_addr) = accept_result?;
|
||||
|
||||
let conn_gauge = Metrics::get()
|
||||
.proxy
|
||||
.client_connections
|
||||
.guard(crate::metrics::Protocol::Tcp);
|
||||
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let cancellation_handler = Arc::clone(&cancellation_handler);
|
||||
let cancellations = cancellations.clone();
|
||||
|
||||
debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
|
||||
let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
|
||||
|
||||
connections.spawn(async move {
|
||||
let (socket, conn_info) = match config.proxy_protocol_v2 {
|
||||
ProxyProtocolV2::Required => {
|
||||
match read_proxy_protocol(socket).await {
|
||||
Err(e) => {
|
||||
warn!("per-client task finished with an error: {e:#}");
|
||||
return;
|
||||
}
|
||||
// our load balancers will not send any more data. let's just exit immediately
|
||||
Ok((_socket, ConnectHeader::Local)) => {
|
||||
debug!("healthcheck received");
|
||||
return;
|
||||
}
|
||||
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
|
||||
}
|
||||
}
|
||||
// ignore the header - it cannot be confused for a postgres or http connection so will
|
||||
// error later.
|
||||
ProxyProtocolV2::Rejected => (
|
||||
socket,
|
||||
ConnectionInfo {
|
||||
addr: peer_addr,
|
||||
extra: None,
|
||||
},
|
||||
),
|
||||
};
|
||||
|
||||
match socket.set_nodelay(true) {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"per-client task finished with an error: failed to set socket option: {e:#}"
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(
|
||||
session_id,
|
||||
conn_info,
|
||||
crate::metrics::Protocol::Tcp,
|
||||
&config.region,
|
||||
);
|
||||
|
||||
let res = handle_client(
|
||||
config,
|
||||
auth_backend,
|
||||
&ctx,
|
||||
cancellation_handler,
|
||||
socket,
|
||||
ClientMode::Tcp,
|
||||
endpoint_rate_limiter2,
|
||||
conn_gauge,
|
||||
cancellations,
|
||||
)
|
||||
.instrument(ctx.span())
|
||||
.boxed()
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Err(e) => {
|
||||
ctx.set_error_kind(e.get_error_kind());
|
||||
warn!(parent: &ctx.span(), "per-client task finished with an error: {e:#}");
|
||||
}
|
||||
Ok(None) => {
|
||||
ctx.set_success();
|
||||
}
|
||||
Ok(Some(p)) => {
|
||||
ctx.set_success();
|
||||
let _disconnect = ctx.log_connect();
|
||||
match p.proxy_pass().await {
|
||||
Ok(()) => {}
|
||||
Err(ErrorSource::Client(e)) => {
|
||||
warn!(
|
||||
?session_id,
|
||||
"per-client task finished with an IO error from the client: {e:#}"
|
||||
);
|
||||
}
|
||||
Err(ErrorSource::Compute(e)) => {
|
||||
error!(
|
||||
?session_id,
|
||||
"per-client task finished with an IO error from the compute: {e:#}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
connections.close();
|
||||
cancellations.close();
|
||||
drop(listener);
|
||||
|
||||
// Drain connections
|
||||
connections.wait().await;
|
||||
cancellations.wait().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) enum ClientMode {
|
||||
Tcp,
|
||||
Websockets { hostname: Option<String> },
|
||||
}
|
||||
|
||||
/// Abstracts the logic of handling TCP vs WS clients
|
||||
impl ClientMode {
|
||||
pub(crate) fn allow_cleartext(&self) -> bool {
|
||||
match self {
|
||||
ClientMode::Tcp => false,
|
||||
ClientMode::Websockets { .. } => true,
|
||||
}
|
||||
}
|
||||
|
||||
fn hostname<'a, S>(&'a self, s: &'a Stream<S>) -> Option<&'a str> {
|
||||
match self {
|
||||
ClientMode::Tcp => s.sni_hostname(),
|
||||
ClientMode::Websockets { hostname } => hostname.as_deref(),
|
||||
}
|
||||
}
|
||||
|
||||
fn handshake_tls<'a>(&self, tls: Option<&'a TlsConfig>) -> Option<&'a TlsConfig> {
|
||||
match self {
|
||||
ClientMode::Tcp => tls,
|
||||
// TLS is None here if using websockets, because the connection is already encrypted.
|
||||
ClientMode::Websockets { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
// almost all errors should be reported to the user, but there's a few cases where we cannot
|
||||
// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons
|
||||
// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation,
|
||||
// we cannot be sure the client even understands our error message
|
||||
// 3. PrepareClient: The client disconnected, so we can't tell them anyway...
|
||||
pub(crate) enum ClientRequestError {
|
||||
#[error("{0}")]
|
||||
Cancellation(#[from] cancellation::CancelError),
|
||||
#[error("{0}")]
|
||||
Handshake(#[from] HandshakeError),
|
||||
#[error("{0}")]
|
||||
HandshakeTimeout(#[from] tokio::time::error::Elapsed),
|
||||
#[error("{0}")]
|
||||
PrepareClient(#[from] std::io::Error),
|
||||
#[error("{0}")]
|
||||
ReportedError(#[from] crate::stream::ReportedError),
|
||||
}
|
||||
|
||||
impl ReportableError for ClientRequestError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
ClientRequestError::Cancellation(e) => e.get_error_kind(),
|
||||
ClientRequestError::Handshake(e) => e.get_error_kind(),
|
||||
ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit,
|
||||
ClientRequestError::ReportedError(e) => e.get_error_kind(),
|
||||
ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
config: &'static ProxyConfig,
|
||||
auth_backend: &'static auth::Backend<'static, ()>,
|
||||
ctx: &RequestContext,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
stream: S,
|
||||
mode: ClientMode,
|
||||
client: &mut PqStream<Stream<S>>,
|
||||
mode: &ClientMode,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
conn_gauge: NumClientConnectionsGuard<'static>,
|
||||
cancellations: tokio_util::task::task_tracker::TaskTracker,
|
||||
) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
|
||||
debug!(
|
||||
protocol = %ctx.protocol(),
|
||||
"handling interactive connection from client"
|
||||
);
|
||||
|
||||
let metrics = &Metrics::get().proxy;
|
||||
let proto = ctx.protocol();
|
||||
let request_gauge = metrics.connection_requests.guard(proto);
|
||||
|
||||
let tls = config.tls_config.load();
|
||||
let tls = tls.as_deref();
|
||||
|
||||
let record_handshake_error = !ctx.has_private_peer_addr();
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
|
||||
|
||||
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
|
||||
.await??
|
||||
{
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let ctx = ctx.clone();
|
||||
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id());
|
||||
cancel_span.follows_from(tracing::Span::current());
|
||||
async move {
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
ctx,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
config.authentication_config.is_vpc_acccess_proxy,
|
||||
auth_backend.get_api(),
|
||||
)
|
||||
.await
|
||||
.inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok();
|
||||
}.instrument(cancel_span)
|
||||
});
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
drop(pause);
|
||||
|
||||
ctx.set_db_options(params.clone());
|
||||
|
||||
let hostname = mode.hostname(stream.get_ref());
|
||||
|
||||
let common_names = tls.map(|tls| &tls.common_names);
|
||||
|
||||
common_names: Option<&HashSet<String>>,
|
||||
params: &StartupMessageParams,
|
||||
) -> Result<(ComputeConnection, oneshot::Sender<Infallible>), ClientRequestError> {
|
||||
let hostname = mode.hostname(client.get_ref());
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let result = auth_backend
|
||||
.as_ref()
|
||||
.map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names))
|
||||
.map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, params, hostname, common_names))
|
||||
.transpose();
|
||||
|
||||
let user_info = match result {
|
||||
Ok(user_info) => user_info,
|
||||
Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
|
||||
Err(e) => Err(client.throw_error(e, Some(ctx)).await)?,
|
||||
};
|
||||
|
||||
let user = user_info.get_user().to_owned();
|
||||
let user_info = match user_info
|
||||
.authenticate(
|
||||
ctx,
|
||||
&mut stream,
|
||||
client,
|
||||
mode.allow_cleartext(),
|
||||
&config.authentication_config,
|
||||
endpoint_rate_limiter,
|
||||
@@ -339,7 +75,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
let app = params.get("application_name");
|
||||
let params_span = tracing::info_span!("", ?user, ?db, ?app);
|
||||
|
||||
return Err(stream
|
||||
return Err(client
|
||||
.throw_error(e, Some(ctx))
|
||||
.instrument(params_span)
|
||||
.await)?;
|
||||
@@ -352,37 +88,67 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
};
|
||||
let params_compat = creds.info.options.get(NeonOptions::PARAMS_COMPAT).is_some();
|
||||
let mut auth_info = compute::AuthInfo::with_auth_keys(creds.keys);
|
||||
auth_info.set_startup_params(¶ms, params_compat);
|
||||
auth_info.set_startup_params(params, params_compat);
|
||||
|
||||
let res = connect_to_compute(
|
||||
ctx,
|
||||
&TcpMechanism {
|
||||
locks: &config.connect_compute_locks,
|
||||
},
|
||||
&auth::Backend::ControlPlane(cplane, creds.info.clone()),
|
||||
config.wake_compute_retry_config,
|
||||
&config.connect_to_compute,
|
||||
)
|
||||
.await;
|
||||
|
||||
let mut node = match res {
|
||||
Ok(node) => node,
|
||||
Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
|
||||
let mut node;
|
||||
let mut attempt = 0;
|
||||
let connect = TcpMechanism {
|
||||
locks: &config.connect_compute_locks,
|
||||
};
|
||||
let backend = auth::Backend::ControlPlane(cplane, creds.info);
|
||||
|
||||
let pg_settings = auth_info.authenticate(ctx, &mut node, creds.info).await;
|
||||
let pg_settings = match pg_settings {
|
||||
Ok(pg_settings) => pg_settings,
|
||||
Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
|
||||
// NOTE: This is messy, but should hopefully be detangled with PGLB.
|
||||
// We wanted to separate the concerns of **connect** to compute (a PGLB operation),
|
||||
// from **authenticate** to compute (a NeonKeeper operation).
|
||||
//
|
||||
// This unfortunately removed retry handling for one error case where
|
||||
// the compute was cached, and we connected, but the compute cache was actually stale
|
||||
// and is associated with the wrong endpoint. We detect this when the **authentication** fails.
|
||||
// As such, we retry once here if the `authenticate` function fails and the error is valid to retry.
|
||||
let pg_settings = loop {
|
||||
attempt += 1;
|
||||
|
||||
// TODO: callback to pglb
|
||||
let res = connect_to_compute(
|
||||
ctx,
|
||||
&connect,
|
||||
&backend,
|
||||
config.wake_compute_retry_config,
|
||||
&config.connect_to_compute,
|
||||
)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(n) => node = n,
|
||||
Err(e) => return Err(client.throw_error(e, Some(ctx)).await)?,
|
||||
}
|
||||
|
||||
let auth::Backend::ControlPlane(cplane, user_info) = &backend else {
|
||||
unreachable!("ensured above");
|
||||
};
|
||||
|
||||
let res = auth_info.authenticate(ctx, &mut node, user_info).await;
|
||||
match res {
|
||||
Ok(pg_settings) => break pg_settings,
|
||||
Err(e) if attempt < 2 && e.should_retry_wake_compute() => {
|
||||
tracing::warn!(error = ?e, "retrying wake compute");
|
||||
|
||||
#[allow(irrefutable_let_patterns)]
|
||||
if let ControlPlaneClient::ProxyV1(cplane_proxy_v1) = &**cplane {
|
||||
let key = user_info.endpoint_cache_key();
|
||||
cplane_proxy_v1.caches.node_info.invalidate(&key);
|
||||
}
|
||||
}
|
||||
Err(e) => Err(client.throw_error(e, Some(ctx)).await)?,
|
||||
}
|
||||
};
|
||||
|
||||
let session = cancellation_handler.get_key();
|
||||
|
||||
prepare_client_connection(&pg_settings, *session.key(), &mut stream);
|
||||
let stream = stream.flush_and_into_inner().await?;
|
||||
finish_client_init(&pg_settings, *session.key(), client);
|
||||
|
||||
let session_id = ctx.session_id();
|
||||
let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel();
|
||||
let (cancel_on_shutdown, cancel) = oneshot::channel();
|
||||
tokio::spawn(async move {
|
||||
session
|
||||
.maintain_cancel_key(
|
||||
@@ -394,50 +160,32 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
|
||||
.await;
|
||||
});
|
||||
|
||||
let private_link_id = match ctx.extra() {
|
||||
Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()),
|
||||
Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()),
|
||||
None => None,
|
||||
};
|
||||
|
||||
Ok(Some(ProxyPassthrough {
|
||||
client: stream,
|
||||
compute: node.stream,
|
||||
|
||||
aux: node.aux,
|
||||
private_link_id,
|
||||
|
||||
_cancel_on_shutdown: cancel_on_shutdown,
|
||||
|
||||
_req: request_gauge,
|
||||
_conn: conn_gauge,
|
||||
_db_conn: node.guage,
|
||||
}))
|
||||
Ok((node, cancel_on_shutdown))
|
||||
}
|
||||
|
||||
/// Finish client connection initialization: confirm auth success, send params, etc.
|
||||
pub(crate) fn prepare_client_connection(
|
||||
pub(crate) fn finish_client_init(
|
||||
settings: &compute::PostgresSettings,
|
||||
cancel_key_data: CancelKeyData,
|
||||
stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) {
|
||||
// Forward all deferred notices to the client.
|
||||
for notice in &settings.delayed_notice {
|
||||
stream.write_raw(notice.as_bytes().len(), b'N', |buf| {
|
||||
client.write_raw(notice.as_bytes().len(), b'N', |buf| {
|
||||
buf.extend_from_slice(notice.as_bytes());
|
||||
});
|
||||
}
|
||||
|
||||
// Forward all postgres connection params to the client.
|
||||
for (name, value) in &settings.params {
|
||||
stream.write_message(BeMessage::ParameterStatus {
|
||||
client.write_message(BeMessage::ParameterStatus {
|
||||
name: name.as_bytes(),
|
||||
value: value.as_bytes(),
|
||||
});
|
||||
}
|
||||
|
||||
stream.write_message(BeMessage::BackendKeyData(cancel_key_data));
|
||||
stream.write_message(BeMessage::ReadyForQuery);
|
||||
client.write_message(BeMessage::BackendKeyData(cancel_key_data));
|
||||
client.write_message(BeMessage::ReadyForQuery);
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
|
||||
@@ -447,7 +195,7 @@ impl NeonOptions {
|
||||
// proxy options:
|
||||
|
||||
/// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute.
|
||||
const PARAMS_COMPAT: &str = "proxy_params_compat";
|
||||
pub const PARAMS_COMPAT: &str = "proxy_params_compat";
|
||||
|
||||
// cplane options:
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::io;
|
||||
|
||||
use tokio::time;
|
||||
|
||||
use crate::compute;
|
||||
use crate::compute::{self, PostgresError};
|
||||
use crate::config::RetryConfig;
|
||||
|
||||
pub(crate) trait CouldRetry {
|
||||
@@ -115,6 +115,14 @@ impl ShouldRetryWakeCompute for compute::ConnectionError {
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetryWakeCompute for PostgresError {
|
||||
fn should_retry_wake_compute(&self) -> bool {
|
||||
match self {
|
||||
PostgresError::Postgres(error) => error.should_retry_wake_compute(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
|
||||
config
|
||||
.base_delay
|
||||
|
||||
@@ -14,6 +14,9 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt, DuplexStream};
|
||||
use tokio_util::codec::{Decoder, Encoder};
|
||||
|
||||
use super::*;
|
||||
use crate::config::TlsConfig;
|
||||
use crate::context::RequestContext;
|
||||
use crate::pglb::handshake::{HandshakeData, handshake};
|
||||
|
||||
enum Intercept {
|
||||
None,
|
||||
@@ -169,7 +172,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
|
||||
.dbname("db")
|
||||
.password("password")
|
||||
.ssl_mode(SslMode::Require)
|
||||
.connect_raw(server, client_config.make_tls_connect()?)
|
||||
.tls_and_authenticate(server, client_config.make_tls_connect()?)
|
||||
.await?;
|
||||
|
||||
proxy.await?
|
||||
@@ -252,7 +255,7 @@ async fn connect_failure(
|
||||
.dbname("db")
|
||||
.password("password")
|
||||
.ssl_mode(SslMode::Require)
|
||||
.connect_raw(server, client_config.make_tls_connect()?)
|
||||
.tls_and_authenticate(server, client_config.make_tls_connect()?)
|
||||
.await
|
||||
.err()
|
||||
.context("client shouldn't be able to connect")?;
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
mod mitm;
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
@@ -10,26 +11,31 @@ use async_trait::async_trait;
|
||||
use http::StatusCode;
|
||||
use postgres_client::config::SslMode;
|
||||
use postgres_client::tls::{MakeTlsConnect, NoTls};
|
||||
use retry::{ShouldRetryWakeCompute, retry_after};
|
||||
use rstest::rstest;
|
||||
use rustls::crypto::ring;
|
||||
use rustls::pki_types;
|
||||
use tokio::io::DuplexStream;
|
||||
use tokio::io::{AsyncRead, AsyncWrite, DuplexStream};
|
||||
use tracing_test::traced_test;
|
||||
|
||||
use super::retry::CouldRetry;
|
||||
use super::*;
|
||||
use crate::auth::backend::{ComputeUserInfo, MaybeOwned};
|
||||
use crate::config::{ComputeConfig, RetryConfig};
|
||||
use crate::config::{ComputeConfig, RetryConfig, TlsConfig};
|
||||
use crate::context::RequestContext;
|
||||
use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient};
|
||||
use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status};
|
||||
use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache};
|
||||
use crate::error::ErrorKind;
|
||||
use crate::proxy::connect_compute::ConnectMechanism;
|
||||
use crate::error::{ErrorKind, ReportableError};
|
||||
use crate::pglb::ERR_INSECURE_CONNECTION;
|
||||
use crate::pglb::handshake::{HandshakeData, handshake};
|
||||
use crate::pqproto::BeMessage;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::proxy::connect_compute::{ConnectMechanism, connect_to_compute};
|
||||
use crate::proxy::retry::{ShouldRetryWakeCompute, retry_after};
|
||||
use crate::stream::{PqStream, Stream};
|
||||
use crate::tls::client_config::compute_client_config_with_certs;
|
||||
use crate::tls::server_config::CertResolver;
|
||||
use crate::types::{BranchId, EndpointId, ProjectId};
|
||||
use crate::{sasl, scram};
|
||||
use crate::{auth, compute, sasl, scram};
|
||||
|
||||
/// Generate a set of TLS certificates: CA + server.
|
||||
fn generate_certs(
|
||||
@@ -199,7 +205,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> {
|
||||
.user("john_doe")
|
||||
.dbname("earth")
|
||||
.ssl_mode(SslMode::Disable)
|
||||
.connect_raw(server, NoTls)
|
||||
.tls_and_authenticate(server, NoTls)
|
||||
.await
|
||||
.err() // -> Option<E>
|
||||
.context("client shouldn't be able to connect")?;
|
||||
@@ -228,7 +234,7 @@ async fn handshake_tls() -> anyhow::Result<()> {
|
||||
.user("john_doe")
|
||||
.dbname("earth")
|
||||
.ssl_mode(SslMode::Require)
|
||||
.connect_raw(server, client_config.make_tls_connect()?)
|
||||
.tls_and_authenticate(server, client_config.make_tls_connect()?)
|
||||
.await?;
|
||||
|
||||
proxy.await?
|
||||
@@ -245,7 +251,7 @@ async fn handshake_raw() -> anyhow::Result<()> {
|
||||
.dbname("earth")
|
||||
.set_param("options", "project=generic-project-name")
|
||||
.ssl_mode(SslMode::Prefer)
|
||||
.connect_raw(server, NoTls)
|
||||
.tls_and_authenticate(server, NoTls)
|
||||
.await?;
|
||||
|
||||
proxy.await?
|
||||
@@ -293,7 +299,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
|
||||
.dbname("db")
|
||||
.password(password)
|
||||
.ssl_mode(SslMode::Require)
|
||||
.connect_raw(server, client_config.make_tls_connect()?)
|
||||
.tls_and_authenticate(server, client_config.make_tls_connect()?)
|
||||
.await?;
|
||||
|
||||
proxy.await?
|
||||
@@ -317,7 +323,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
|
||||
.dbname("db")
|
||||
.password("password")
|
||||
.ssl_mode(SslMode::Require)
|
||||
.connect_raw(server, client_config.make_tls_connect()?)
|
||||
.tls_and_authenticate(server, client_config.make_tls_connect()?)
|
||||
.await?;
|
||||
|
||||
proxy.await?
|
||||
@@ -344,7 +350,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
|
||||
.dbname("db")
|
||||
.password(&password) // no password will match the mocked secret
|
||||
.ssl_mode(SslMode::Require)
|
||||
.connect_raw(server, client_config.make_tls_connect()?)
|
||||
.tls_and_authenticate(server, client_config.make_tls_connect()?)
|
||||
.await
|
||||
.err() // -> Option<E>
|
||||
.context("client shouldn't be able to connect")?;
|
||||
@@ -374,6 +380,7 @@ fn connect_compute_total_wait() {
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
enum ConnectAction {
|
||||
Wake,
|
||||
WakeCold,
|
||||
WakeFail,
|
||||
WakeRetry,
|
||||
Connect,
|
||||
@@ -504,6 +511,9 @@ impl TestControlPlaneClient for TestConnectMechanism {
|
||||
*counter += 1;
|
||||
match action {
|
||||
ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
|
||||
ConnectAction::WakeCold => Ok(CachedNodeInfo::new_uncached(
|
||||
helper_create_uncached_node_info(),
|
||||
)),
|
||||
ConnectAction::WakeFail => {
|
||||
let err = control_plane::errors::ControlPlaneError::Message(Box::new(
|
||||
ControlPlaneErrorMessage {
|
||||
@@ -551,8 +561,8 @@ impl TestControlPlaneClient for TestConnectMechanism {
|
||||
}
|
||||
}
|
||||
|
||||
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
|
||||
let node = NodeInfo {
|
||||
fn helper_create_uncached_node_info() -> NodeInfo {
|
||||
NodeInfo {
|
||||
conn_info: compute::ConnectInfo {
|
||||
host: "test".into(),
|
||||
port: 5432,
|
||||
@@ -566,7 +576,11 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
|
||||
compute_id: "compute".into(),
|
||||
cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
|
||||
let node = helper_create_uncached_node_info();
|
||||
let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone()));
|
||||
node2.map(|()| node)
|
||||
}
|
||||
@@ -742,7 +756,7 @@ async fn fail_no_wake_skips_cache_invalidation() {
|
||||
let ctx = RequestContext::test();
|
||||
let mech = TestConnectMechanism::new(vec![
|
||||
ConnectAction::Wake,
|
||||
ConnectAction::FailNoWake,
|
||||
ConnectAction::RetryNoWake,
|
||||
ConnectAction::Connect,
|
||||
]);
|
||||
let user = helper_create_connect_info(&mech);
|
||||
@@ -788,7 +802,7 @@ async fn retry_no_wake_skips_invalidation() {
|
||||
|
||||
let ctx = RequestContext::test();
|
||||
// Wake → RetryNoWake (retryable + NOT wakeable)
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]);
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake, Fail]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
let cfg = config();
|
||||
|
||||
@@ -802,3 +816,44 @@ async fn retry_no_wake_skips_invalidation() {
|
||||
"invalidating stalled compute node info cache entry"
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[traced_test]
|
||||
async fn retry_no_wake_error_fast() {
|
||||
let _ = env_logger::try_init();
|
||||
use ConnectAction::*;
|
||||
|
||||
let ctx = RequestContext::test();
|
||||
// Wake → FailNoWake (not retryable + NOT wakeable)
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, FailNoWake]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
let cfg = config();
|
||||
|
||||
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
|
||||
// Because FailNoWake has wakeable=false, we must NOT see invalidate_cache
|
||||
assert!(!logs_contain(
|
||||
"invalidating stalled compute node info cache entry"
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[traced_test]
|
||||
async fn retry_cold_wake_skips_invalidation() {
|
||||
let _ = env_logger::try_init();
|
||||
use ConnectAction::*;
|
||||
|
||||
let ctx = RequestContext::test();
|
||||
// WakeCold → FailNoWake (not retryable + NOT wakeable)
|
||||
let mechanism = TestConnectMechanism::new(vec![WakeCold, Retry, Connect]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
let cfg = config();
|
||||
|
||||
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -139,12 +139,6 @@ impl RateBucketInfo {
|
||||
Self::new(200, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
// For all the sessions will be cancel key. So this limit is essentially global proxy limit.
|
||||
pub const DEFAULT_REDIS_SET: [Self; 2] = [
|
||||
Self::new(100_000, Duration::from_secs(1)),
|
||||
Self::new(50_000, Duration::from_secs(10)),
|
||||
];
|
||||
|
||||
pub fn rps(&self) -> f64 {
|
||||
(self.max_rpi as f64) / self.interval.as_secs_f64()
|
||||
}
|
||||
|
||||
@@ -23,9 +23,8 @@ impl KeyPrefix {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::pqproto::id_to_cancel_key;
|
||||
|
||||
use super::*;
|
||||
use crate::pqproto::id_to_cancel_key;
|
||||
|
||||
#[test]
|
||||
fn test_build_redis_key() {
|
||||
|
||||
@@ -5,11 +5,9 @@ use redis::aio::ConnectionLike;
|
||||
use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};
|
||||
|
||||
use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};
|
||||
|
||||
pub struct RedisKVClient {
|
||||
client: ConnectionWithCredentialsProvider,
|
||||
limiter: GlobalRateLimiter,
|
||||
}
|
||||
|
||||
#[allow(async_fn_in_trait)]
|
||||
@@ -30,11 +28,8 @@ impl Queryable for Cmd {
|
||||
}
|
||||
|
||||
impl RedisKVClient {
|
||||
pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self {
|
||||
Self {
|
||||
client,
|
||||
limiter: GlobalRateLimiter::new(info.into()),
|
||||
}
|
||||
pub fn new(client: ConnectionWithCredentialsProvider) -> Self {
|
||||
Self { client }
|
||||
}
|
||||
|
||||
pub async fn try_connect(&mut self) -> anyhow::Result<()> {
|
||||
@@ -49,11 +44,6 @@ impl RedisKVClient {
|
||||
&mut self,
|
||||
q: &impl Queryable,
|
||||
) -> anyhow::Result<T> {
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping query");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
|
||||
let e = match q.query(&mut self.client).await {
|
||||
Ok(t) => return Ok(t),
|
||||
Err(e) => e,
|
||||
|
||||
@@ -141,29 +141,19 @@ where
|
||||
|
||||
struct MessageHandler<C: ProjectInfoCache + Send + Sync + 'static> {
|
||||
cache: Arc<C>,
|
||||
region_id: String,
|
||||
}
|
||||
|
||||
impl<C: ProjectInfoCache + Send + Sync + 'static> Clone for MessageHandler<C> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
cache: self.cache.clone(),
|
||||
region_id: self.region_id.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
pub(crate) fn new(cache: Arc<C>, region_id: String) -> Self {
|
||||
Self { cache, region_id }
|
||||
}
|
||||
|
||||
pub(crate) async fn increment_active_listeners(&self) {
|
||||
self.cache.increment_active_listeners().await;
|
||||
}
|
||||
|
||||
pub(crate) async fn decrement_active_listeners(&self) {
|
||||
self.cache.decrement_active_listeners().await;
|
||||
pub(crate) fn new(cache: Arc<C>) -> Self {
|
||||
Self { cache }
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
|
||||
@@ -276,7 +266,7 @@ async fn handle_messages<C: ProjectInfoCache + Send + Sync + 'static>(
|
||||
}
|
||||
let mut conn = match try_connect(&redis).await {
|
||||
Ok(conn) => {
|
||||
handler.increment_active_listeners().await;
|
||||
handler.cache.increment_active_listeners().await;
|
||||
conn
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -297,11 +287,11 @@ async fn handle_messages<C: ProjectInfoCache + Send + Sync + 'static>(
|
||||
}
|
||||
}
|
||||
if cancellation_token.is_cancelled() {
|
||||
handler.decrement_active_listeners().await;
|
||||
handler.cache.decrement_active_listeners().await;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
handler.decrement_active_listeners().await;
|
||||
handler.cache.decrement_active_listeners().await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -310,12 +300,11 @@ async fn handle_messages<C: ProjectInfoCache + Send + Sync + 'static>(
|
||||
pub async fn task_main<C>(
|
||||
redis: ConnectionWithCredentialsProvider,
|
||||
cache: Arc<C>,
|
||||
region_id: String,
|
||||
) -> anyhow::Result<Infallible>
|
||||
where
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
{
|
||||
let handler = MessageHandler::new(cache, region_id);
|
||||
let handler = MessageHandler::new(cache);
|
||||
// 6h - 1m.
|
||||
// There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost.
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60));
|
||||
|
||||
@@ -417,12 +417,7 @@ async fn request_handler(
|
||||
if config.http_config.accept_websockets
|
||||
&& framed_websockets::upgrade::is_upgrade_request(&request)
|
||||
{
|
||||
let ctx = RequestContext::new(
|
||||
session_id,
|
||||
conn_info,
|
||||
crate::metrics::Protocol::Ws,
|
||||
&config.region,
|
||||
);
|
||||
let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Ws);
|
||||
|
||||
ctx.set_user_agent(
|
||||
request
|
||||
@@ -462,12 +457,7 @@ async fn request_handler(
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response.map(|b| b.map_err(|x| match x {}).boxed()))
|
||||
} else if request.uri().path() == "/sql" && *request.method() == Method::POST {
|
||||
let ctx = RequestContext::new(
|
||||
session_id,
|
||||
conn_info,
|
||||
crate::metrics::Protocol::Http,
|
||||
&config.region,
|
||||
);
|
||||
let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Http);
|
||||
let span = ctx.span();
|
||||
|
||||
let testodrome_id = request
|
||||
|
||||
@@ -17,7 +17,8 @@ use crate::config::ProxyConfig;
|
||||
use crate::context::RequestContext;
|
||||
use crate::error::ReportableError;
|
||||
use crate::metrics::Metrics;
|
||||
use crate::proxy::{ClientMode, ErrorSource, handle_client};
|
||||
use crate::pglb::{ClientMode, handle_connection};
|
||||
use crate::proxy::ErrorSource;
|
||||
use crate::rate_limiter::EndpointRateLimiter;
|
||||
|
||||
pin_project! {
|
||||
@@ -142,7 +143,7 @@ pub(crate) async fn serve_websocket(
|
||||
.client_connections
|
||||
.guard(crate::metrics::Protocol::Ws);
|
||||
|
||||
let res = Box::pin(handle_client(
|
||||
let res = Box::pin(handle_connection(
|
||||
config,
|
||||
auth_backend,
|
||||
&ctx,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.87.0"
|
||||
channel = "1.88.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -220,7 +220,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
stripe_size: ShardStripeSize(stripe_size),
|
||||
};
|
||||
self.shard =
|
||||
Some(ShardIdentity::from_params(ShardNumber(number), ¶ms));
|
||||
Some(ShardIdentity::from_params(ShardNumber(number), params));
|
||||
}
|
||||
_ => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::cmp::min;
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::RangeInclusive;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
@@ -34,7 +35,7 @@ use crate::control_file::CONTROL_FILE_NAME;
|
||||
use crate::state::{EvictionState, TimelinePersistentState};
|
||||
use crate::timeline::{Timeline, TimelineError, WalResidentTimeline};
|
||||
use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline};
|
||||
use crate::wal_storage::open_wal_file;
|
||||
use crate::wal_storage::{open_wal_file, wal_file_paths};
|
||||
use crate::{GlobalTimelines, debug_dump, wal_backup};
|
||||
|
||||
/// Stream tar archive of timeline to tx.
|
||||
@@ -95,8 +96,8 @@ pub async fn stream_snapshot(
|
||||
|
||||
/// State needed while streaming the snapshot.
|
||||
pub struct SnapshotContext {
|
||||
pub from_segno: XLogSegNo, // including
|
||||
pub upto_segno: XLogSegNo, // including
|
||||
/// The interval of segment numbers. If None, the timeline hasn't had writes yet, so only send the control file
|
||||
pub from_to_segno: Option<RangeInclusive<XLogSegNo>>,
|
||||
pub term: Term,
|
||||
pub last_log_term: Term,
|
||||
pub flush_lsn: Lsn,
|
||||
@@ -174,23 +175,35 @@ pub async fn stream_snapshot_resident_guts(
|
||||
.await?;
|
||||
pausable_failpoint!("sk-snapshot-after-list-pausable");
|
||||
|
||||
let tli_dir = tli.get_timeline_dir();
|
||||
info!(
|
||||
"sending {} segments [{:#X}-{:#X}], term={}, last_log_term={}, flush_lsn={}",
|
||||
bctx.upto_segno - bctx.from_segno + 1,
|
||||
bctx.from_segno,
|
||||
bctx.upto_segno,
|
||||
bctx.term,
|
||||
bctx.last_log_term,
|
||||
bctx.flush_lsn,
|
||||
);
|
||||
for segno in bctx.from_segno..=bctx.upto_segno {
|
||||
let (mut sf, is_partial) = open_wal_file(&tli_dir, segno, bctx.wal_seg_size).await?;
|
||||
let mut wal_file_name = XLogFileName(PG_TLI, segno, bctx.wal_seg_size);
|
||||
if is_partial {
|
||||
wal_file_name.push_str(".partial");
|
||||
if let Some(from_to_segno) = &bctx.from_to_segno {
|
||||
let tli_dir = tli.get_timeline_dir();
|
||||
info!(
|
||||
"sending {} segments [{:#X}-{:#X}], term={}, last_log_term={}, flush_lsn={}",
|
||||
from_to_segno.end() - from_to_segno.start() + 1,
|
||||
from_to_segno.start(),
|
||||
from_to_segno.end(),
|
||||
bctx.term,
|
||||
bctx.last_log_term,
|
||||
bctx.flush_lsn,
|
||||
);
|
||||
for segno in from_to_segno.clone() {
|
||||
let Some((mut sf, is_partial)) =
|
||||
open_wal_file(&tli_dir, segno, bctx.wal_seg_size).await?
|
||||
else {
|
||||
// File is not found
|
||||
let (wal_file_path, _wal_file_partial_path) =
|
||||
wal_file_paths(&tli_dir, segno, bctx.wal_seg_size);
|
||||
tracing::warn!("couldn't find WAL segment file {wal_file_path}");
|
||||
bail!("couldn't find WAL segment file {wal_file_path}")
|
||||
};
|
||||
let mut wal_file_name = XLogFileName(PG_TLI, segno, bctx.wal_seg_size);
|
||||
if is_partial {
|
||||
wal_file_name.push_str(".partial");
|
||||
}
|
||||
ar.append_file(&wal_file_name, &mut sf).await?;
|
||||
}
|
||||
ar.append_file(&wal_file_name, &mut sf).await?;
|
||||
} else {
|
||||
info!("Not including any segments into the snapshot");
|
||||
}
|
||||
|
||||
// Do the term check before ar.finish to make archive corrupted in case of
|
||||
@@ -338,19 +351,26 @@ impl WalResidentTimeline {
|
||||
// removed further than `backup_lsn`. Since we're holding shared_state
|
||||
// lock and setting `wal_removal_on_hold` later, it guarantees that WAL
|
||||
// won't be removed until we're done.
|
||||
let timeline_state = shared_state.sk.state();
|
||||
let from_lsn = min(
|
||||
shared_state.sk.state().remote_consistent_lsn,
|
||||
shared_state.sk.state().backup_lsn,
|
||||
timeline_state.remote_consistent_lsn,
|
||||
timeline_state.backup_lsn,
|
||||
);
|
||||
let flush_lsn = shared_state.sk.flush_lsn();
|
||||
let (send_segments, msg) = if from_lsn == Lsn::INVALID {
|
||||
(false, "snapshot is called on uninitialized timeline")
|
||||
} else {
|
||||
(true, "timeline is initialized")
|
||||
};
|
||||
tracing::info!(
|
||||
remote_consistent_lsn=%timeline_state.remote_consistent_lsn,
|
||||
backup_lsn=%timeline_state.backup_lsn,
|
||||
%flush_lsn,
|
||||
"{msg}"
|
||||
);
|
||||
if from_lsn == Lsn::INVALID {
|
||||
// this is possible if snapshot is called before handling first
|
||||
// elected message
|
||||
bail!("snapshot is called on uninitialized timeline");
|
||||
}
|
||||
let from_segno = from_lsn.segment_number(wal_seg_size);
|
||||
let term = shared_state.sk.state().acceptor_state.term;
|
||||
let last_log_term = shared_state.sk.last_log_term();
|
||||
let flush_lsn = shared_state.sk.flush_lsn();
|
||||
let upto_segno = flush_lsn.segment_number(wal_seg_size);
|
||||
// have some limit on max number of segments as a sanity check
|
||||
const MAX_ALLOWED_SEGS: u64 = 1000;
|
||||
@@ -376,9 +396,9 @@ impl WalResidentTimeline {
|
||||
drop(shared_state);
|
||||
|
||||
let tli_copy = self.wal_residence_guard().await?;
|
||||
let from_to_segno = send_segments.then_some(from_segno..=upto_segno);
|
||||
let bctx = SnapshotContext {
|
||||
from_segno,
|
||||
upto_segno,
|
||||
from_to_segno,
|
||||
term,
|
||||
last_log_term,
|
||||
flush_lsn,
|
||||
|
||||
@@ -9,7 +9,7 @@ use anyhow::{Result, bail};
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use postgres_versioninfo::{PgMajorVersion, PgVersionId};
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse};
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use safekeeper_api::{INITIAL_TERM, ServerInfo, Term};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
@@ -83,6 +83,11 @@ pub enum EvictionState {
|
||||
Offloaded(Lsn),
|
||||
}
|
||||
|
||||
pub struct MembershipSwitchResult {
|
||||
pub previous_conf: Configuration,
|
||||
pub current_conf: Configuration,
|
||||
}
|
||||
|
||||
impl TimelinePersistentState {
|
||||
/// commit_lsn is the same as start_lsn in the normal creaiton; see
|
||||
/// `TimelineCreateRequest` comments.`
|
||||
@@ -261,10 +266,7 @@ where
|
||||
|
||||
/// Switch into membership configuration `to` if it is higher than the
|
||||
/// current one.
|
||||
pub async fn membership_switch(
|
||||
&mut self,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
pub async fn membership_switch(&mut self, to: Configuration) -> Result<MembershipSwitchResult> {
|
||||
let before = self.mconf.clone();
|
||||
// Is switch allowed?
|
||||
if to.generation <= self.mconf.generation {
|
||||
@@ -278,7 +280,7 @@ where
|
||||
self.finish_change(&state).await?;
|
||||
info!("switched membership conf to {} from {}", to, before);
|
||||
}
|
||||
Ok(TimelineMembershipSwitchResponse {
|
||||
Ok(MembershipSwitchResult {
|
||||
previous_conf: before,
|
||||
current_conf: self.mconf.clone(),
|
||||
})
|
||||
|
||||
@@ -190,7 +190,14 @@ impl StateSK {
|
||||
&mut self,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
self.state_mut().membership_switch(to).await
|
||||
let result = self.state_mut().membership_switch(to).await?;
|
||||
|
||||
Ok(TimelineMembershipSwitchResponse {
|
||||
previous_conf: result.previous_conf,
|
||||
current_conf: result.current_conf,
|
||||
term: self.state().acceptor_state.term,
|
||||
flush_lsn: self.flush_lsn(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Close open WAL files to release FDs.
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
use std::cmp::{max, min};
|
||||
use std::future::Future;
|
||||
use std::io::{self, SeekFrom};
|
||||
use std::io::{ErrorKind, SeekFrom};
|
||||
use std::pin::Pin;
|
||||
|
||||
use anyhow::{Context, Result, bail};
|
||||
@@ -154,8 +154,8 @@ pub struct PhysicalStorage {
|
||||
/// record
|
||||
///
|
||||
/// Partial segment 002 has no WAL records, and it will be removed by the
|
||||
/// next truncate_wal(). This flag will be set to true after the first
|
||||
/// truncate_wal() call.
|
||||
/// next truncate_wal(). This flag will be set to false after the first
|
||||
/// successful truncate_wal() call.
|
||||
///
|
||||
/// [`write_lsn`]: Self::write_lsn
|
||||
pending_wal_truncation: bool,
|
||||
@@ -202,6 +202,8 @@ impl PhysicalStorage {
|
||||
ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn,
|
||||
);
|
||||
if flush_lsn < state.commit_lsn {
|
||||
// note: can never happen. find_end_of_wal returns provided start_lsn
|
||||
// (state.commit_lsn in our case) if it doesn't find anything.
|
||||
bail!(
|
||||
"timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file",
|
||||
ttid.timeline_id,
|
||||
@@ -794,26 +796,13 @@ impl WalReader {
|
||||
|
||||
// Try to open local file, if we may have WAL locally
|
||||
if self.pos >= self.local_start_lsn {
|
||||
let res = open_wal_file(&self.timeline_dir, segno, self.wal_seg_size).await;
|
||||
match res {
|
||||
Ok((mut file, _)) => {
|
||||
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
|
||||
return Ok(Box::pin(file));
|
||||
}
|
||||
Err(e) => {
|
||||
let is_not_found = e.chain().any(|e| {
|
||||
if let Some(e) = e.downcast_ref::<io::Error>() {
|
||||
e.kind() == io::ErrorKind::NotFound
|
||||
} else {
|
||||
false
|
||||
}
|
||||
});
|
||||
if !is_not_found {
|
||||
return Err(e);
|
||||
}
|
||||
// NotFound is expected, fall through to remote read
|
||||
}
|
||||
};
|
||||
let res = open_wal_file(&self.timeline_dir, segno, self.wal_seg_size).await?;
|
||||
if let Some((mut file, _)) = res {
|
||||
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
|
||||
return Ok(Box::pin(file));
|
||||
} else {
|
||||
// NotFound is expected, fall through to remote read
|
||||
}
|
||||
}
|
||||
|
||||
// Try to open remote file, if remote reads are enabled
|
||||
@@ -832,26 +821,31 @@ pub(crate) async fn open_wal_file(
|
||||
timeline_dir: &Utf8Path,
|
||||
segno: XLogSegNo,
|
||||
wal_seg_size: usize,
|
||||
) -> Result<(tokio::fs::File, bool)> {
|
||||
) -> Result<Option<(tokio::fs::File, bool)>> {
|
||||
let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size);
|
||||
|
||||
// First try to open the .partial file.
|
||||
let mut partial_path = wal_file_path.to_owned();
|
||||
partial_path.set_extension("partial");
|
||||
if let Ok(opened_file) = tokio::fs::File::open(&wal_file_partial_path).await {
|
||||
return Ok((opened_file, true));
|
||||
return Ok(Some((opened_file, true)));
|
||||
}
|
||||
|
||||
// If that failed, try it without the .partial extension.
|
||||
let pf = tokio::fs::File::open(&wal_file_path)
|
||||
.await
|
||||
let pf_res = tokio::fs::File::open(&wal_file_path).await;
|
||||
if let Err(e) = &pf_res {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
let pf = pf_res
|
||||
.with_context(|| format!("failed to open WAL file {wal_file_path:#}"))
|
||||
.map_err(|e| {
|
||||
warn!("{}", e);
|
||||
warn!("{e}");
|
||||
e
|
||||
})?;
|
||||
|
||||
Ok((pf, false))
|
||||
Ok(Some((pf, false)))
|
||||
}
|
||||
|
||||
/// Helper returning full path to WAL segment file and its .partial brother.
|
||||
|
||||
@@ -20,6 +20,7 @@ camino.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
clashmap.workspace = true
|
||||
compute_api.workspace = true
|
||||
cron.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
@@ -27,6 +28,7 @@ governor.workspace = true
|
||||
hex.workspace = true
|
||||
hyper0.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
itertools.workspace = true
|
||||
json-structural-diff.workspace = true
|
||||
lasso.workspace = true
|
||||
@@ -34,6 +36,7 @@ once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
posthog_client_lite.workspace = true
|
||||
rand.workspace = true
|
||||
reqwest = { workspace = true, features = ["stream"] }
|
||||
routerify.workspace = true
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user