Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-08 13:10:37 +00:00)

Compare commits: hackathon/ ... release-59 (415 commits)
415 commits are listed, from d74fb7b879 (first row) to df7a9d1407 (last row). Only the SHA1 column of the commit table (Author | SHA1 | Date) was captured in this mirror view; the author, date, and commit-message cells are empty.
.github/actions/run-python-test-set/action.yml (vendored, 12 lines changed)

@@ -114,6 +114,8 @@ runs:
 export PLATFORM=${PLATFORM:-github-actions-selfhosted}
 export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
 export DEFAULT_PG_VERSION=${PG_VERSION#v}
+export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
+export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}

 if [ "${BUILD_TYPE}" = "remote" ]; then
 export REMOTE_ENV=1

@@ -178,7 +180,15 @@ runs:

 # Wake up the cluster if we use remote neon instance
 if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
+QUERIES=("SELECT version()")
+if [[ "${PLATFORM}" = "neon"* ]]; then
+QUERIES+=("SHOW neon.tenant_id")
+QUERIES+=("SHOW neon.timeline_id")
+fi
+
+for q in "${QUERIES[@]}"; do
+${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}"
+done
 fi

 # Run the tests.
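The wake-up step above leans on two bash idioms that are easy to miss: ${VAR:-default} substitutes a default when the variable is unset, and ${PG_VERSION#v} strips a leading "v". A minimal standalone sketch of the same pattern follows; the connection string and defaults are hypothetical placeholders, not values taken from the workflow.

#!/usr/bin/env bash
set -euo pipefail

PG_VERSION=${PG_VERSION:-v16}        # fall back to v16 if the caller sets nothing
DEFAULT_PG_VERSION=${PG_VERSION#v}   # "v16" -> "16"
CONNSTR=${CONNSTR:-postgres://localhost:5432/postgres}   # hypothetical placeholder

# Build the query list, adding Neon-specific SHOW commands only on neon* platforms.
QUERIES=("SELECT version()")
if [[ "${PLATFORM:-}" = "neon"* ]]; then
  QUERIES+=("SHOW neon.tenant_id")
  QUERIES+=("SHOW neon.timeline_id")
fi

for q in "${QUERIES[@]}"; do
  psql "${CONNSTR}" -c "${q}"
done

echo "Woke up Postgres ${DEFAULT_PG_VERSION} with ${#QUERIES[@]} queries"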
.github/workflows/benchmarking.yml (vendored, 170 lines changed)

@@ -56,15 +56,26 @@ concurrency:
 jobs:
 bench:
 if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+strategy:
+matrix:
+include:
+- DEFAULT_PG_VERSION: 14
+PLATFORM: "neon-staging"
+region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+provisioner: 'k8s-pod'
+- DEFAULT_PG_VERSION: 16
+PLATFORM: "azure-staging"
+region_id: 'azure-eastus2'
+provisioner: 'k8s-neonvm'
 env:
 TEST_PG_BENCH_DURATIONS_MATRIX: "300"
 TEST_PG_BENCH_SCALES_MATRIX: "10,100"
 POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-DEFAULT_PG_VERSION: 14
+DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
 TEST_OUTPUT: /tmp/test_output
 BUILD_TYPE: remote
 SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-PLATFORM: "neon-staging"
+PLATFORM: ${{ matrix.PLATFORM }}

 runs-on: [ self-hosted, us-east-2, x64 ]
 container:

@@ -85,9 +96,10 @@ jobs:
 id: create-neon-project
 uses: ./.github/actions/neon-project-create
 with:
-region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+region_id: ${{ matrix.region_id }}
 postgres_version: ${{ env.DEFAULT_PG_VERSION }}
 api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+provisioner: ${{ matrix.provisioner }}

 - name: Run benchmark
 uses: ./.github/actions/run-python-test-set

@@ -96,10 +108,18 @@ jobs:
 test_selection: performance
 run_in_parallel: false
 save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+pg_version: ${{ env.DEFAULT_PG_VERSION }}
 # Set --sparse-ordering option of pytest-order plugin
 # to ensure tests are running in order of appears in the file.
 # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
+extra_params:
+-m remote_cluster
+--sparse-ordering
+--timeout 14400
+--ignore test_runner/performance/test_perf_olap.py
+--ignore test_runner/performance/test_perf_pgvector_queries.py
+--ignore test_runner/performance/test_logical_replication.py
+--ignore test_runner/performance/test_physical_replication.py
 env:
 BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
 VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

@@ -125,6 +145,69 @@ jobs:
 env:
 SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

+replication-tests:
+env:
+POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+DEFAULT_PG_VERSION: 14
+TEST_OUTPUT: /tmp/test_output
+BUILD_TYPE: remote
+SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+PLATFORM: "neon-staging"
+
+runs-on: [ self-hosted, us-east-2, x64 ]
+container:
+image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+options: --init
+
+steps:
+- uses: actions/checkout@v4
+
+- name: Download Neon artifact
+uses: ./.github/actions/download
+with:
+name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+path: /tmp/neon/
+prefix: latest
+
+- name: Run benchmark
+uses: ./.github/actions/run-python-test-set
+with:
+build_type: ${{ env.BUILD_TYPE }}
+test_selection: performance/test_logical_replication.py
+run_in_parallel: false
+save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+extra_params: -m remote_cluster --timeout 5400
+env:
+VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+- name: Run benchmark
+uses: ./.github/actions/run-python-test-set
+with:
+build_type: ${{ env.BUILD_TYPE }}
+test_selection: performance/test_physical_replication.py
+run_in_parallel: false
+save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+extra_params: -m remote_cluster --timeout 5400
+env:
+VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+- name: Create Allure report
+if: ${{ !cancelled() }}
+uses: ./.github/actions/allure-report-generate
+
+- name: Post to a Slack channel
+if: ${{ github.event.schedule && failure() }}
+uses: slackapi/slack-github-action@v1
+with:
+channel-id: "C033QLM5P7D" # dev-staging-stream
+slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+env:
+SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
 generate-matrices:
 if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
 # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)

@@ -239,11 +322,6 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Create Neon Project
 if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
 id: create-neon-project

@@ -282,16 +360,6 @@ jobs:

 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-QUERIES=("SELECT version()")
-if [[ "${PLATFORM}" = "neon"* ]]; then
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-fi
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
 
 - name: Benchmark init
 uses: ./.github/actions/run-python-test-set
 with:

@@ -377,25 +445,12 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Set up Connection String
 id: set-up-connstr
 run: |
 CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}

-echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-
-QUERIES=("SELECT version()")
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
+echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

 - name: Benchmark pgvector hnsw indexing
 uses: ./.github/actions/run-python-test-set

@@ -417,12 +472,12 @@ jobs:
 test_selection: performance/test_perf_pgvector_queries.py
 run_in_parallel: false
 save_perf_report: ${{ env.SAVE_PERF_REPORT }}
 extra_params: -m remote_cluster --timeout 21600
 env:
 BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
 PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

 - name: Create Allure report
 if: ${{ !cancelled() }}
 uses: ./.github/actions/allure-report-generate

@@ -477,11 +532,6 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Set up Connection String
 id: set-up-connstr
 run: |

@@ -503,16 +553,6 @@ jobs:

 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-QUERIES=("SELECT version()")
-if [[ "${PLATFORM}" = "neon"* ]]; then
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-fi
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
 
 - name: ClickBench benchmark
 uses: ./.github/actions/run-python-test-set
 with:

@@ -580,11 +620,6 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Get Connstring Secret Name
 run: |
 case "${PLATFORM}" in

@@ -613,16 +648,6 @@ jobs:

 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-QUERIES=("SELECT version()")
-if [[ "${PLATFORM}" = "neon"* ]]; then
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-fi
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
 
 - name: Run TPC-H benchmark
 uses: ./.github/actions/run-python-test-set
 with:

@@ -681,11 +706,6 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Set up Connection String
 id: set-up-connstr
 run: |

@@ -707,16 +727,6 @@ jobs:

 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-QUERIES=("SELECT version()")
-if [[ "${PLATFORM}" = "neon"* ]]; then
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-fi
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
 
 - name: Run user examples
 uses: ./.github/actions/run-python-test-set
 with:
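The new replication-tests job wraps the run-python-test-set action around two specific suites. Stripped of the action's artifact handling, the underlying pytest invocation is roughly the sketch below; treat it as an assumption about what the action ends up running (the ./scripts/pytest wrapper and the remote_cluster marker appear elsewhere in this changeset), not as the job's literal command.

#!/usr/bin/env bash
set -euo pipefail

# Environment the job provides; the action is assumed to export the rest
# (POSTGRES_DISTRIB_DIR, BENCHMARK_CONNSTR, ...) before invoking pytest.
export BUILD_TYPE=remote
export NEON_API_KEY="..."   # placeholder for secrets.NEON_STAGING_API_KEY

./scripts/pytest -m remote_cluster --timeout 5400 \
  test_runner/performance/test_logical_replication.py \
  test_runner/performance/test_physical_replication.py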
(workflow file whose path was not captured in this view; vendored)

@@ -63,14 +63,16 @@ jobs:
 mkdir -p /tmp/.docker-custom
 echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV

-- uses: docker/setup-buildx-action@v2
+- uses: docker/setup-buildx-action@v3
+with:
+cache-binary: false

-- uses: docker/login-action@v2
+- uses: docker/login-action@v3
 with:
 username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
 password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-- uses: docker/build-push-action@v4
+- uses: docker/build-push-action@v6
 with:
 context: .
 provenance: false

@@ -82,6 +84,7 @@ jobs:
 tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}

 - name: Remove custom docker config directory
+if: always()
 run: |
 rm -rf /tmp/.docker-custom

.github/workflows/build_and_test.yml (vendored, 57 lines changed)

@@ -30,7 +30,7 @@ jobs:
 if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
 uses: ./.github/workflows/check-permissions.yml
 with:
-github-event-name: ${{ github.event_name}}
+github-event-name: ${{ github.event_name }}

 cancel-previous-e2e-tests:
 needs: [ check-permissions ]

@@ -335,6 +335,8 @@ jobs:

 - name: Run cargo build
 run: |
+PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+export PQ_LIB_DIR
 ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

 # Do install *before* running rust tests because they might recompile the

@@ -383,6 +385,11 @@ jobs:
 env:
 NEXTEST_RETRIES: 3
 run: |
+PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+export PQ_LIB_DIR
+LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+export LD_LIBRARY_PATH
+
 #nextest does not yet support running doctests
 cargo test --doc $CARGO_FLAGS $CARGO_FEATURES

@@ -744,14 +751,16 @@ jobs:
 run: |
 mkdir -p .docker-custom
 echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-- uses: docker/setup-buildx-action@v2
+- uses: docker/setup-buildx-action@v3
+with:
+cache-binary: false

 - uses: docker/login-action@v3
 with:
 username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
 password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-- uses: docker/build-push-action@v5
+- uses: docker/build-push-action@v6
 with:
 context: .
 build-args: |

@@ -822,11 +831,12 @@ jobs:
 run: |
 mkdir -p .docker-custom
 echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-- uses: docker/setup-buildx-action@v2
+- uses: docker/setup-buildx-action@v3
 with:
+cache-binary: false
 # Disable parallelism for docker buildkit.
 # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
-config-inline: |
+buildkitd-config-inline: |
 [worker.oci]
 max-parallelism = 1

@@ -842,7 +852,7 @@ jobs:
 password: ${{ secrets.AWS_SECRET_KEY_DEV }}

 - name: Build compute-node image
-uses: docker/build-push-action@v5
+uses: docker/build-push-action@v6
 with:
 context: .
 build-args: |

@@ -861,7 +871,7 @@ jobs:

 - name: Build neon extensions test image
 if: matrix.version == 'v16'
-uses: docker/build-push-action@v5
+uses: docker/build-push-action@v6
 with:
 context: .
 build-args: |

@@ -882,7 +892,7 @@ jobs:
 - name: Build compute-tools image
 # compute-tools are Postgres independent, so build it only once
 if: matrix.version == 'v16'
-uses: docker/build-push-action@v5
+uses: docker/build-push-action@v6
 with:
 target: compute-tools-image
 context: .

@@ -1326,6 +1336,7 @@ jobs:
 env:
 BUCKET: neon-github-public-dev
 PREFIX: artifacts/latest
+COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
 run: |
 # Update compatibility snapshot for the release
 for pg_version in v14 v15 v16; do

@@ -1339,7 +1350,7 @@ jobs:

 # Update Neon artifact for the release (reuse already uploaded artifact)
 for build_type in debug release; do
-OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
+OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
 FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst

 S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)

@@ -1358,3 +1369,31 @@ jobs:
 with:
 from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
 secrets: inherit
+
+# This job simplifies setting branch protection rules (in GitHub UI)
+# by allowing to set only this job instead of listing many others.
+# It also makes it easier to rename or parametrise jobs (using matrix)
+# which requires changes in branch protection rules
+#
+# Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that.
+#
+# https://github.com/neondatabase/neon/settings/branch_protection_rules
+conclusion:
+if: always()
+# Format `needs` differently to make the list more readable.
+# Usually we do `needs: [...]`
+needs:
+- check-codestyle-python
+- check-codestyle-rust
+- regress-tests
+- test-images
+runs-on: ubuntu-22.04
+steps:
+# The list of possible results:
+# https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context
+- name: Fail the job if any of the dependencies do not succeed
+run: exit 1
+if: |
+contains(needs.*.result, 'failure')
+|| contains(needs.*.result, 'cancelled')
+|| contains(needs.*.result, 'skipped')
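Several hunks in this file (and in neon_extra_builds.yml below) export PQ_LIB_DIR and LD_LIBRARY_PATH before invoking cargo. A minimal local equivalent is sketched here; it assumes the pg_install/v16 directory already exists in the working tree (for example after building Postgres via the repository's Makefile) and leaves out the workflow's $CARGO_FLAGS and $CARGO_FEATURES variables.

#!/usr/bin/env bash
set -euo pipefail

# Let the libpq-linking crates find the locally built Postgres v16 libraries,
# and make the shared objects resolvable when the test binaries run.
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
export LD_LIBRARY_PATH

cargo build --bins --tests
cargo test --doc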
.github/workflows/neon_extra_builds.yml (vendored, 9 lines changed)

@@ -232,12 +232,19 @@ jobs:

 - name: Run cargo build
 run: |
+PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+export PQ_LIB_DIR
 mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)

 - name: Run cargo test
 env:
 NEXTEST_RETRIES: 3
 run: |
+PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+export PQ_LIB_DIR
+LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+export LD_LIBRARY_PATH
+
 cargo nextest run $CARGO_FEATURES -j$(nproc)

 # Run separate tests for real S3

@@ -378,7 +385,7 @@ jobs:
 run: make walproposer-lib -j$(nproc)

 - name: Produce the build stats
-run: cargo build --all --release --timings -j$(nproc)
+run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)

 - name: Upload the build stats
 id: upload-stats
.github/workflows/periodic_pagebench.yml (vendored, new file, 155 lines added)

@@ -0,0 +1,155 @@
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region

on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:
type: string
description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
required: false
default: ''

defaults:
run:
shell: bash -euo pipefail {0}

concurrency:
group: ${{ github.workflow }}
cancel-in-progress: false

jobs:
trigger_bench_on_ec2_machine_in_eu_central_1:
runs-on: [ self-hosted, gen3, small ]
container:
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
timeout-minutes: 360 # Set the timeout to 6 hours
env:
API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
RUN_ID: ${{ github.run_id }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
AWS_DEFAULT_REGION : "eu-central-1"
AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
steps:
# we don't need the neon source code because we run everything remotely
# however we still need the local github actions to run the allure step below
- uses: actions/checkout@v4

- name: Show my own (github runner) external IP address - usefull for IP allowlisting
run: curl https://ifconfig.me

- name: Start EC2 instance and wait for the instance to boot up
run: |
aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
sleep 60 # sleep some time to allow cloudinit and our API server to start up

- name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
run: |
public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
echo "Public IP of the EC2 instance: $public_ip"
echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV

- name: Determine commit hash
env:
INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
run: |
if [ -z "$INPUT_COMMIT_HASH" ]; then
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
else
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
fi

- name: Start Bench with run_id
run: |
curl -k -X 'POST' \
"${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"

- name: Poll Test Status
id: poll_step
run: |
status=""
while [[ "$status" != "failure" && "$status" != "success" ]]; do
response=$(curl -k -X 'GET' \
"${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H "Authorization: Bearer $API_KEY")
echo "Response: $response"
set +x
status=$(echo $response | jq -r '.status')
echo "Test status: $status"
if [[ "$status" == "failure" ]]; then
echo "Test failed"
exit 1 # Fail the job step if status is failure
elif [[ "$status" == "success" || "$status" == "null" ]]; then
break
elif [[ "$status" == "too_many_runs" ]]; then
echo "Too many runs already running"
echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
exit 1
fi

sleep 60 # Poll every 60 seconds
done

- name: Retrieve Test Logs
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
curl -k -X 'GET' \
"${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
-H 'accept: application/gzip' \
-H "Authorization: Bearer $API_KEY" \
--output "test_log_${GITHUB_RUN_ID}.gz"

- name: Unzip Test Log and Print it into this job's log
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
gzip -d "test_log_${GITHUB_RUN_ID}.gz"
cat "test_log_${GITHUB_RUN_ID}"

- name: Create Allure report
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate

- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

- name: Cleanup Test Resources
if: always()
run: |
curl -k -X 'POST' \
"${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d ''

- name: Stop EC2 instance and wait for the instance to be stopped
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
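The workflow above drives a small HTTP API on the dedicated EC2 runner. A condensed sketch of the same call sequence follows, assuming EC2_MACHINE_URL_US and API_KEY are already set in the environment exactly as the workflow sets them; it simplifies the status handling (no too_many_runs branch) and uses a locally generated run id, which the API may or may not accept, so treat it as an assumption rather than a supported client.

#!/usr/bin/env bash
set -euo pipefail

RUN_ID=manual-$(date +%s)   # assumed: any unique identifier works; the workflow uses the GitHub run id
COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')

# Kick off a benchmark run for the chosen pageserver commit.
curl -k -X POST "${EC2_MACHINE_URL_US}/start_test/${RUN_ID}" \
  -H 'Content-Type: application/json' \
  -H "Authorization: Bearer ${API_KEY}" \
  -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"

# Poll until the run finishes, then fetch the log and clean up.
while true; do
  status=$(curl -sk "${EC2_MACHINE_URL_US}/test_status/${RUN_ID}" \
    -H "Authorization: Bearer ${API_KEY}" | jq -r '.status')
  [[ "$status" == "success" || "$status" == "failure" || "$status" == "null" ]] && break
  sleep 60
done

curl -sk "${EC2_MACHINE_URL_US}/test_log/${RUN_ID}" \
  -H "Authorization: Bearer ${API_KEY}" --output "test_log_${RUN_ID}.gz"
curl -sk -X POST "${EC2_MACHINE_URL_US}/cleanup_test/${RUN_ID}" \
  -H "Authorization: Bearer ${API_KEY}" -d ''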
.github/workflows/pg-clients.yml (vendored, new file, 115 lines added)

@@ -0,0 +1,115 @@
name: Test Postgres client libraries

on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc
pull_request:
paths:
- '.github/workflows/pg-clients.yml'
- 'test_runner/pg_clients/**'
- 'poetry.lock'
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}

defaults:
run:
shell: bash -euxo pipefail {0}

env:
DEFAULT_PG_VERSION: 16
PLATFORM: neon-captest-new
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
AWS_DEFAULT_REGION: eu-central-1

jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}

check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml

build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit

test-postgres-client-libs:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04

container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init --user root

steps:
- uses: actions/checkout@v4

- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest

- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}

- name: Run tests
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: pg_clients
run_in_parallel: false
extra_params: -m remote_cluster
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}

- name: Delete Neon Project
if: always()
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

- name: Post to a Slack channel
if: github.event.schedule && failure()
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
.github/workflows/pg_clients.yml (vendored, deleted, all 98 lines removed)

@@ -1,98 +0,0 @@
name: Test Postgres client libraries

on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc

workflow_dispatch:

concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true

jobs:
test-postgres-client-libs:
# TODO: switch to gen2 runner, requires docker
runs-on: ubuntu-22.04

env:
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output

steps:
- name: Checkout
uses: actions/checkout@v4

- uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Install Poetry
uses: snok/install-poetry@v1

- name: Cache poetry deps
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}

- name: Install Python deps
shell: bash -euxo pipefail {0}
run: ./scripts/pysync

- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}

- name: Run pytest
env:
REMOTE_ENV: 1
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
shell: bash -euxo pipefail {0}
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it
mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "remote_cluster" \
-rA "test_runner/pg_clients"

- name: Delete Neon Project
if: ${{ always() }}
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

# We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI.
# It will be fixed after switching to gen2 runner
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v4
with:
retention-days: 7
name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs
path: ${{ env.TEST_OUTPUT }}

- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
Cargo.lock (generated, 105 lines changed)

@@ -1236,6 +1236,7 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
+"rlimit",
 "rust-ini",
 "serde",
 "serde_json",

@@ -1397,9 +1398,9 @@ dependencies = [

 [[package]]
 name = "crc32c"
-version = "0.6.5"
+version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
+checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
 dependencies = [
 "rustc_version",
 ]

@@ -1651,6 +1652,16 @@ dependencies = [
 "rusticata-macros",
 ]

+[[package]]
+name = "deranged"
+version = "0.3.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+dependencies = [
+"powerfmt",
+"serde",
+]
+
 [[package]]
 name = "desim"
 version = "0.1.0"

@@ -2017,16 +2028,6 @@ dependencies = [
 "tokio-util",
 ]

-[[package]]
-name = "fs2"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
-dependencies = [
-"libc",
-"winapi",
-]
-
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"

@@ -3008,9 +3009,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
+checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
 dependencies = [
 "bytes",
 "crossbeam-utils",

@@ -3026,9 +3027,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
+checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",

@@ -3038,9 +3039,9 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
+checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
 dependencies = [
 "libc",
 "measured",

@@ -3275,6 +3276,12 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "num-conv"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+
 [[package]]
 name = "num-integer"
 version = "0.1.45"

@@ -3667,6 +3674,7 @@ dependencies = [
 "sysinfo",
 "tenant_size_model",
 "thiserror",
+"tikv-jemallocator",
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",

@@ -4077,6 +4085,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.25.0",
+"tokio-util",
 "tracing",
 "workspace_hack",
 ]

@@ -4117,6 +4126,12 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"

@@ -4877,6 +4892,15 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "rlimit"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
+dependencies = [
+"libc",
+]
+
 [[package]]
 name = "routerify"
 version = "3.0.0"

@@ -5145,7 +5169,6 @@ dependencies = [
 "crc32c",
 "desim",
 "fail",
-"fs2",
 "futures",
 "git-version",
 "hex",

@@ -5172,6 +5195,8 @@ dependencies = [
 "sha2",
 "signal-hook",
 "storage_broker",
+"strum",
+"strum_macros",
 "thiserror",
 "tokio",
 "tokio-io-timeout",

@@ -5396,9 +5421,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"

 [[package]]
 name = "serde"
-version = "1.0.183"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
+checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
 dependencies = [
 "serde_derive",
 ]

@@ -5415,9 +5440,9 @@ dependencies = [

 [[package]]
 name = "serde_derive"
-version = "1.0.183"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
+checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
 dependencies = [
 "proc-macro2",
 "quote",

@@ -6107,12 +6132,15 @@ dependencies = [

 [[package]]
 name = "time"
-version = "0.3.21"
+version = "0.3.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
+checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
 dependencies = [
+"deranged",
 "itoa",
 "js-sys",
+"num-conv",
+"powerfmt",
 "serde",
 "time-core",
 "time-macros",

@@ -6120,16 +6148,17 @@ dependencies = [

 [[package]]
name = "time-core"
|
name = "time-core"
|
||||||
version = "0.1.1"
|
version = "0.1.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
|
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "time-macros"
|
name = "time-macros"
|
||||||
version = "0.2.9"
|
version = "0.2.18"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
|
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"num-conv",
|
||||||
"time-core",
|
"time-core",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -6472,17 +6501,6 @@ version = "0.3.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "trace"
|
|
||||||
version = "0.1.0"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"clap",
|
|
||||||
"pageserver_api",
|
|
||||||
"utils",
|
|
||||||
"workspace_hack",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tracing"
|
name = "tracing"
|
||||||
version = "0.1.37"
|
version = "0.1.37"
|
||||||
@@ -6811,6 +6829,7 @@ dependencies = [
|
|||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tokio-tar",
|
"tokio-tar",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
|
"toml_edit 0.19.10",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-error",
|
"tracing-error",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
@@ -7426,13 +7445,12 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"clap_builder",
|
"clap_builder",
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
|
"deranged",
|
||||||
"either",
|
"either",
|
||||||
"fail",
|
"fail",
|
||||||
"futures-channel",
|
"futures-channel",
|
||||||
"futures-core",
|
|
||||||
"futures-executor",
|
"futures-executor",
|
||||||
"futures-io",
|
"futures-io",
|
||||||
"futures-sink",
|
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"getrandom 0.2.11",
|
"getrandom 0.2.11",
|
||||||
"hashbrown 0.14.5",
|
"hashbrown 0.14.5",
|
||||||
@@ -7450,7 +7468,9 @@ dependencies = [
|
|||||||
"num-traits",
|
"num-traits",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"parquet",
|
"parquet",
|
||||||
|
"proc-macro2",
|
||||||
"prost",
|
"prost",
|
||||||
|
"quote",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"regex",
|
"regex",
|
||||||
"regex-automata 0.4.3",
|
"regex-automata 0.4.3",
|
||||||
@@ -7467,6 +7487,7 @@ dependencies = [
|
|||||||
"syn 1.0.109",
|
"syn 1.0.109",
|
||||||
"syn 2.0.52",
|
"syn 2.0.52",
|
||||||
"sync_wrapper",
|
"sync_wrapper",
|
||||||
|
"tikv-jemalloc-sys",
|
||||||
"time",
|
"time",
|
||||||
"time-macros",
|
"time-macros",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ members = [
|
|||||||
"storage_controller",
|
"storage_controller",
|
||||||
"storage_scrubber",
|
"storage_scrubber",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
"trace",
|
|
||||||
"libs/compute_api",
|
"libs/compute_api",
|
||||||
"libs/pageserver_api",
|
"libs/pageserver_api",
|
||||||
"libs/postgres_ffi",
|
"libs/postgres_ffi",
|
||||||
@@ -84,7 +83,6 @@ enumset = "1.0.12"
|
|||||||
fail = "0.5.0"
|
fail = "0.5.0"
|
||||||
fallible-iterator = "0.2"
|
fallible-iterator = "0.2"
|
||||||
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||||
fs2 = "0.4.3"
|
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
futures-core = "0.3"
|
futures-core = "0.3"
|
||||||
futures-util = "0.3"
|
futures-util = "0.3"
|
||||||
@@ -111,8 +109,8 @@ lasso = "0.7"
|
|||||||
leaky-bucket = "1.0.1"
|
leaky-bucket = "1.0.1"
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
measured = { version = "0.0.21", features=["lasso"] }
|
measured = { version = "0.0.22", features=["lasso"] }
|
||||||
measured-process = { version = "0.0.21" }
|
measured-process = { version = "0.0.22" }
|
||||||
memoffset = "0.8"
|
memoffset = "0.8"
|
||||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||||
notify = "6.0.0"
|
notify = "6.0.0"
|
||||||
|
|||||||
@@ -42,12 +42,13 @@ ARG CACHEPOT_BUCKET=neon-github-dev
|
|||||||
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||||
|
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
|
||||||
COPY --chown=nonroot . .
|
COPY --chown=nonroot . .
|
||||||
|
|
||||||
# Show build caching stats to check if it was used in the end.
|
# Show build caching stats to check if it was used in the end.
|
||||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||||
RUN set -e \
|
RUN set -e \
|
||||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||||
--bin pg_sni_router \
|
--bin pg_sni_router \
|
||||||
--bin pageserver \
|
--bin pageserver \
|
||||||
--bin pagectl \
|
--bin pagectl \
|
||||||
@@ -56,6 +57,7 @@ RUN set -e \
|
|||||||
--bin storage_controller \
|
--bin storage_controller \
|
||||||
--bin proxy \
|
--bin proxy \
|
||||||
--bin neon_local \
|
--bin neon_local \
|
||||||
|
--bin storage_scrubber \
|
||||||
--locked --release \
|
--locked --release \
|
||||||
&& cachepot -s
|
&& cachepot -s
|
||||||
|
|
||||||
@@ -82,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker
|
|||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||||
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
|
||||||
|
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||||
|
|||||||
@@ -1,5 +1,13 @@
|
|||||||
FROM debian:bullseye-slim
|
FROM debian:bullseye-slim
|
||||||
|
|
||||||
|
# Use ARG as a build-time environment variable here to allow.
|
||||||
|
# It's not supposed to be set outside.
|
||||||
|
# Alternatively it can be obtained using the following command
|
||||||
|
# ```
|
||||||
|
# . /etc/os-release && echo "${VERSION_CODENAME}"
|
||||||
|
# ```
|
||||||
|
ARG DEBIAN_VERSION_CODENAME=bullseye
|
||||||
|
|
||||||
# Add nonroot user
|
# Add nonroot user
|
||||||
RUN useradd -ms /bin/bash nonroot -b /home
|
RUN useradd -ms /bin/bash nonroot -b /home
|
||||||
SHELL ["/bin/bash", "-c"]
|
SHELL ["/bin/bash", "-c"]
|
||||||
@@ -26,7 +34,6 @@ RUN set -e \
|
|||||||
liblzma-dev \
|
liblzma-dev \
|
||||||
libncurses5-dev \
|
libncurses5-dev \
|
||||||
libncursesw5-dev \
|
libncursesw5-dev \
|
||||||
libpq-dev \
|
|
||||||
libreadline-dev \
|
libreadline-dev \
|
||||||
libseccomp-dev \
|
libseccomp-dev \
|
||||||
libsqlite3-dev \
|
libsqlite3-dev \
|
||||||
@@ -67,12 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
|
|||||||
# LLVM
|
# LLVM
|
||||||
ENV LLVM_VERSION=18
|
ENV LLVM_VERSION=18
|
||||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||||
&& apt update \
|
&& apt update \
|
||||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
|
|
||||||
|
# Install docker
|
||||||
|
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
|
||||||
|
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
|
||||||
|
&& apt update \
|
||||||
|
&& apt install -y docker-ce docker-ce-cli \
|
||||||
|
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
|
|
||||||
|
# Configure sudo & docker
|
||||||
|
RUN usermod -aG sudo nonroot && \
|
||||||
|
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
|
||||||
|
usermod -aG docker nonroot
|
||||||
|
|
||||||
# AWS CLI
|
# AWS CLI
|
||||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||||
&& unzip -q awscliv2.zip \
|
&& unzip -q awscliv2.zip \
|
||||||
|
|||||||
@@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
|||||||
zstd = "0.13"
|
zstd = "0.13"
|
||||||
bytes = "1.0"
|
bytes = "1.0"
|
||||||
rust-ini = "0.20.0"
|
rust-ini = "0.20.0"
|
||||||
|
rlimit = "0.10.1"
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
//! - Every start is a fresh start, so the data directory is removed and
|
//! - Every start is a fresh start, so the data directory is removed and
|
||||||
//! initialized again on each run.
|
//! initialized again on each run.
|
||||||
//! - If remote_extension_config is provided, it will be used to fetch extensions list
|
//! - If remote_extension_config is provided, it will be used to fetch extensions list
|
||||||
//! and download `shared_preload_libraries` from the remote storage.
|
//! and download `shared_preload_libraries` from the remote storage.
|
||||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||||
//! - Sync safekeepers and get commit LSN.
|
//! - Sync safekeepers and get commit LSN.
|
||||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||||
@@ -33,7 +33,6 @@
|
|||||||
//! -b /usr/local/bin/postgres \
|
//! -b /usr/local/bin/postgres \
|
||||||
//! -r http://pg-ext-s3-gateway \
|
//! -r http://pg-ext-s3-gateway \
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
@@ -64,6 +63,7 @@ use compute_tools::monitor::launch_monitor;
|
|||||||
use compute_tools::params::*;
|
use compute_tools::params::*;
|
||||||
use compute_tools::spec::*;
|
use compute_tools::spec::*;
|
||||||
use compute_tools::swap::resize_swap;
|
use compute_tools::swap::resize_swap;
|
||||||
|
use rlimit::{setrlimit, Resource};
|
||||||
|
|
||||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||||
// in-case of not-set environment var
|
// in-case of not-set environment var
|
||||||
@@ -72,6 +72,9 @@ const BUILD_TAG_DEFAULT: &str = "latest";
|
|||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
let (build_tag, clap_args) = init()?;
|
let (build_tag, clap_args) = init()?;
|
||||||
|
|
||||||
|
// enable core dumping for all child processes
|
||||||
|
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||||
|
|
||||||
let (pg_handle, start_pg_result) = {
|
let (pg_handle, start_pg_result) = {
|
||||||
// Enter startup tracing context
|
// Enter startup tracing context
|
||||||
let _startup_context_guard = startup_context_from_env();
|
let _startup_context_guard = startup_context_from_env();
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ pub struct ComputeNode {
|
|||||||
/// - we push new spec and it does reconfiguration
|
/// - we push new spec and it does reconfiguration
|
||||||
/// - but then something happens and compute pod / VM is destroyed,
|
/// - but then something happens and compute pod / VM is destroyed,
|
||||||
/// so k8s controller starts it again with the **old** spec
|
/// so k8s controller starts it again with the **old** spec
|
||||||
|
///
|
||||||
/// and the same for empty computes:
|
/// and the same for empty computes:
|
||||||
/// - we started compute without any spec
|
/// - we started compute without any spec
|
||||||
/// - we push spec and it does configuration
|
/// - we push spec and it does configuration
|
||||||
@@ -798,7 +799,11 @@ impl ComputeNode {
|
|||||||
// In this case we need to connect with old `zenith_admin` name
|
// In this case we need to connect with old `zenith_admin` name
|
||||||
// and create new user. We cannot simply rename connected user,
|
// and create new user. We cannot simply rename connected user,
|
||||||
// but we can create a new one and grant it all privileges.
|
// but we can create a new one and grant it all privileges.
|
||||||
let connstr = self.connstr.clone();
|
let mut connstr = self.connstr.clone();
|
||||||
|
connstr
|
||||||
|
.query_pairs_mut()
|
||||||
|
.append_pair("application_name", "apply_config");
|
||||||
|
|
||||||
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
||||||
Err(e) => match e.code() {
|
Err(e) => match e.code() {
|
||||||
Some(&SqlState::INVALID_PASSWORD)
|
Some(&SqlState::INVALID_PASSWORD)
|
||||||
@@ -867,15 +872,19 @@ impl ComputeNode {
|
|||||||
|
|
||||||
// Run migrations separately to not hold up cold starts
|
// Run migrations separately to not hold up cold starts
|
||||||
thread::spawn(move || {
|
thread::spawn(move || {
|
||||||
|
let mut connstr = connstr.clone();
|
||||||
|
connstr
|
||||||
|
.query_pairs_mut()
|
||||||
|
.append_pair("application_name", "migrations");
|
||||||
|
|
||||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||||
handle_migrations(&mut client).context("apply_config handle_migrations")
|
handle_migrations(&mut client).context("apply_config handle_migrations")
|
||||||
});
|
});
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
|
// Wrapped this around `pg_ctl reload`, but right now we don't use
|
||||||
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
|
// `pg_ctl` for start / stop.
|
||||||
// have opened connection to Postgres and superuser access.
|
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
fn pg_reload_conf(&self) -> Result<()> {
|
fn pg_reload_conf(&self) -> Result<()> {
|
||||||
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
|
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
|
||||||
@@ -1108,7 +1117,7 @@ impl ComputeNode {
         // EKS worker nodes have following core dump settings:
         // /proc/sys/kernel/core_pattern -> core
         // /proc/sys/kernel/core_uses_pid -> 1
-        // ulimint -c -> unlimited
+        // ulimit -c -> unlimited
         // which results in core dumps being written to postgres data directory as core.<pid>.
         //
         // Use that as a default location and pattern, except macos where core dumps are written
@@ -1387,7 +1396,9 @@ pub fn forward_termination_signal() {
     let pg_pid = PG_PID.load(Ordering::SeqCst);
     if pg_pid != 0 {
         let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
-        kill(pg_pid, Signal::SIGQUIT).ok();
+        // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
+        // ROs to get a list of running xacts faster instead of going through the CLOG.
+        // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
+        kill(pg_pid, Signal::SIGINT).ok();
     }
 }
|
|||||||
@@ -11,6 +11,7 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
+mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
compute_tools/src/migration.rs (new file, 100 lines)
@@ -0,0 +1,100 @@
+use anyhow::{Context, Result};
+use postgres::Client;
+use tracing::info;
+
+pub(crate) struct MigrationRunner<'m> {
+    client: &'m mut Client,
+    migrations: &'m [&'m str],
+}
+
+impl<'m> MigrationRunner<'m> {
+    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
+        Self { client, migrations }
+    }
+
+    fn get_migration_id(&mut self) -> Result<i64> {
+        let query = "SELECT id FROM neon_migration.migration_id";
+        let row = self
+            .client
+            .query_one(query, &[])
+            .context("run_migrations get migration_id")?;
+
+        Ok(row.get::<&str, i64>("id"))
+    }
+
+    fn update_migration_id(&mut self) -> Result<()> {
+        let setval = format!(
+            "UPDATE neon_migration.migration_id SET id={}",
+            self.migrations.len()
+        );
+
+        self.client
+            .simple_query(&setval)
+            .context("run_migrations update id")?;
+
+        Ok(())
+    }
+
+    fn prepare_migrations(&mut self) -> Result<()> {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        self.client.simple_query(query)?;
+
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        self.client.simple_query(query)?;
+
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        self.client.simple_query(query)?;
+
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        self.client.simple_query(query)?;
+
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        self.client.simple_query(query)?;
+
+        Ok(())
+    }
+
+    pub fn run_migrations(mut self) -> Result<()> {
+        self.prepare_migrations()?;
+
+        let mut current_migration: usize = self.get_migration_id()? as usize;
+        let starting_migration_id = current_migration;
+
+        let query = "BEGIN";
+        self.client
+            .simple_query(query)
+            .context("run_migrations begin")?;
+
+        while current_migration < self.migrations.len() {
+            let migration = self.migrations[current_migration];
+
+            if migration.starts_with("-- SKIP") {
+                info!("Skipping migration id={}", current_migration);
+            } else {
+                info!(
+                    "Running migration id={}:\n{}\n",
+                    current_migration, migration
+                );
+                self.client.simple_query(migration).with_context(|| {
+                    format!("run_migration current_migration={}", current_migration)
+                })?;
+            }
+
+            current_migration += 1;
+        }
+
+        self.update_migration_id()?;
+
+        let query = "COMMIT";
+        self.client
+            .simple_query(query)
+            .context("run_migrations commit")?;
+
+        info!(
+            "Ran {} migrations",
+            (self.migrations.len() - starting_migration_id)
+        );
+
+        Ok(())
+    }
+}
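As an aside (illustrative only, not part of the patch), a minimal usage sketch of the runner above: it assumes an already-connected `postgres::Client` and two inline migration strings, whereas the real caller in `spec.rs` further down passes `include_str!` contents. Since `MigrationRunner` is crate-private, this would only compile inside `compute_tools`.

```rust
use anyhow::Result;
use postgres::{Client, NoTls};
// use crate::migration::MigrationRunner; // crate-private, usable from within compute_tools

fn run_example_migrations(connstr: &str) -> Result<()> {
    // Hypothetical connection string and migrations, for illustration only.
    let mut client = Client::connect(connstr, NoTls)?;
    let migrations: &[&str] = &[
        "CREATE TABLE IF NOT EXISTS example (id bigint)",
        "-- SKIP: placeholder kept only so later migration ids stay stable",
    ];
    // Applies every migration past the id persisted in neon_migration.migration_id,
    // inside one BEGIN/COMMIT, then bumps the id to migrations.len(). Entries that
    // start with "-- SKIP" are logged and skipped.
    MigrationRunner::new(&mut client, migrations).run_migrations()
}
```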
@@ -0,0 +1,7 @@
+DO $$
+BEGIN
+    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
+        EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
+        EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
+    END IF;
+END $$;
@@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()>
 /// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
 /// - next line starts with timestamp
 /// - EOF
-/// - no new lines were written for the last second
+/// - no new lines were written for the last 100 milliseconds
 async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
     let mut lines = tokio::io::BufReader::new(stderr).lines();
     let timeout_duration = Duration::from_millis(100);
@@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
 use crate::logger::inlinify;
+use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -789,71 +790,12 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
             "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
         ),
         include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
+        include_str!(
+            "./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
+        ),
     ];

-    let mut func = || {
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        client.simple_query(query)?;
-
-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        client.simple_query(query)?;
-
-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        client.simple_query(query)?;
-
-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        client.simple_query(query)?;
-
-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        client.simple_query(query)?;
-        Ok::<_, anyhow::Error>(())
-    };
-    func().context("handle_migrations prepare")?;
-
-    let query = "SELECT id FROM neon_migration.migration_id";
-    let row = client
-        .query_one(query, &[])
-        .context("handle_migrations get migration_id")?;
-    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
-    let starting_migration_id = current_migration;
-
-    let query = "BEGIN";
-    client
-        .simple_query(query)
-        .context("handle_migrations begin")?;
-
-    while current_migration < migrations.len() {
-        let migration = &migrations[current_migration];
-        if migration.starts_with("-- SKIP") {
-            info!("Skipping migration id={}", current_migration);
-        } else {
-            info!(
-                "Running migration id={}:\n{}\n",
-                current_migration, migration
-            );
-            client.simple_query(migration).with_context(|| {
-                format!("handle_migrations current_migration={}", current_migration)
-            })?;
-        }
-        current_migration += 1;
-    }
-    let setval = format!(
-        "UPDATE neon_migration.migration_id SET id={}",
-        migrations.len()
-    );
-    client
-        .simple_query(&setval)
-        .context("handle_migrations update id")?;
-
-    let query = "COMMIT";
-    client
-        .simple_query(query)
-        .context("handle_migrations commit")?;
-
-    info!(
-        "Ran {} migrations",
-        (migrations.len() - starting_migration_id)
-    );
-
+    MigrationRunner::new(client, &migrations).run_migrations()?;

     Ok(())
 }
|
|||||||
@@ -325,11 +325,16 @@ impl LocalEnv {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
|
||||||
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
|
Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
||||||
|
self.pg_dir(pg_version, "bin")
|
||||||
|
}
|
||||||
|
|
||||||
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
||||||
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
|
self.pg_dir(pg_version, "lib")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn pageserver_bin(&self) -> PathBuf {
|
pub fn pageserver_bin(&self) -> PathBuf {
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ use std::time::Duration;
|
|||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use futures::SinkExt;
|
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
|
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||||
};
|
};
|
||||||
@@ -350,11 +349,6 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<NonZeroU64>())
|
.map(|x| x.parse::<NonZeroU64>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||||
trace_read_requests: settings
|
|
||||||
.remove("trace_read_requests")
|
|
||||||
.map(|x| x.parse::<bool>())
|
|
||||||
.transpose()
|
|
||||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
|
||||||
eviction_policy: settings
|
eviction_policy: settings
|
||||||
.remove("eviction_policy")
|
.remove("eviction_policy")
|
||||||
.map(serde_json::from_str)
|
.map(serde_json::from_str)
|
||||||
@@ -455,11 +449,6 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<NonZeroU64>())
|
.map(|x| x.parse::<NonZeroU64>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||||
trace_read_requests: settings
|
|
||||||
.remove("trace_read_requests")
|
|
||||||
.map(|x| x.parse::<bool>())
|
|
||||||
.transpose()
|
|
||||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
|
||||||
eviction_policy: settings
|
eviction_policy: settings
|
||||||
.remove("eviction_policy")
|
.remove("eviction_policy")
|
||||||
.map(serde_json::from_str)
|
.map(serde_json::from_str)
|
||||||
@@ -566,60 +555,39 @@ impl PageServerNode {
|
|||||||
pg_wal: Option<(Lsn, PathBuf)>,
|
pg_wal: Option<(Lsn, PathBuf)>,
|
||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let (client, conn) = self.page_server_psql_client().await?;
|
|
||||||
// The connection object performs the actual communication with the database,
|
|
||||||
// so spawn it off to run on its own.
|
|
||||||
tokio::spawn(async move {
|
|
||||||
if let Err(e) = conn.await {
|
|
||||||
eprintln!("connection error: {}", e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
let client = std::pin::pin!(client);
|
|
||||||
|
|
||||||
// Init base reader
|
// Init base reader
|
||||||
let (start_lsn, base_tarfile_path) = base;
|
let (start_lsn, base_tarfile_path) = base;
|
||||||
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
|
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
|
||||||
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
|
let base_tarfile =
|
||||||
|
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
|
||||||
|
|
||||||
// Init wal reader if necessary
|
// Init wal reader if necessary
|
||||||
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||||
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
|
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
|
||||||
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
|
let wal_reader =
|
||||||
|
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
|
||||||
(end_lsn, Some(wal_reader))
|
(end_lsn, Some(wal_reader))
|
||||||
} else {
|
} else {
|
||||||
(start_lsn, None)
|
(start_lsn, None)
|
||||||
};
|
};
|
||||||
|
|
||||||
let copy_in = |reader, cmd| {
|
|
||||||
let client = &client;
|
|
||||||
async move {
|
|
||||||
let writer = client.copy_in(&cmd).await?;
|
|
||||||
let writer = std::pin::pin!(writer);
|
|
||||||
let mut writer = writer.sink_map_err(|e| {
|
|
||||||
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
|
|
||||||
});
|
|
||||||
let mut reader = std::pin::pin!(reader);
|
|
||||||
writer.send_all(&mut reader).await?;
|
|
||||||
writer.into_inner().finish().await?;
|
|
||||||
anyhow::Ok(())
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Import base
|
// Import base
|
||||||
copy_in(
|
self.http_client
|
||||||
base_tarfile,
|
.import_basebackup(
|
||||||
format!(
|
tenant_id,
|
||||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
timeline_id,
|
||||||
),
|
start_lsn,
|
||||||
)
|
end_lsn,
|
||||||
.await?;
|
pg_version,
|
||||||
// Import wal if necessary
|
base_tarfile,
|
||||||
if let Some(wal_reader) = wal_reader {
|
|
||||||
copy_in(
|
|
||||||
wal_reader,
|
|
||||||
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
|
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
// Import wal if necessary
|
||||||
|
if let Some(wal_reader) = wal_reader {
|
||||||
|
self.http_client
|
||||||
|
.import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
|
||||||
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -155,16 +155,16 @@ impl StorageController {
|
|||||||
.expect("non-Unicode path")
|
.expect("non-Unicode path")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
/// Find the directory containing postgres subdirectories, such `bin` and `lib`
|
||||||
///
|
///
|
||||||
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
||||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
|
||||||
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
||||||
|
|
||||||
for v in prefer_versions {
|
for v in prefer_versions {
|
||||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
|
||||||
if tokio::fs::try_exists(&path).await? {
|
if tokio::fs::try_exists(&path).await? {
|
||||||
return Ok(path);
|
return Ok(path);
|
||||||
}
|
}
|
||||||
@@ -172,11 +172,20 @@ impl StorageController {
|
|||||||
|
|
||||||
// Fall through
|
// Fall through
|
||||||
anyhow::bail!(
|
anyhow::bail!(
|
||||||
"Postgres binaries not found in {}",
|
"Postgres directory '{}' not found in {}",
|
||||||
self.env.pg_distrib_dir.display()
|
dir_name,
|
||||||
|
self.env.pg_distrib_dir.display(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||||
|
self.get_pg_dir("bin").await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||||
|
self.get_pg_dir("lib").await
|
||||||
|
}
|
||||||
|
|
||||||
/// Readiness check for our postgres process
|
/// Readiness check for our postgres process
|
||||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||||
let bin_path = pg_bin_dir.join("pg_isready");
|
let bin_path = pg_bin_dir.join("pg_isready");
|
||||||
@@ -229,12 +238,17 @@ impl StorageController {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.join("storage_controller_db");
|
.join("storage_controller_db");
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
|
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||||
let pg_log_path = pg_data_path.join("postgres.log");
|
let pg_log_path = pg_data_path.join("postgres.log");
|
||||||
|
|
||||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||||
// Initialize empty database
|
// Initialize empty database
|
||||||
let initdb_path = pg_bin_dir.join("initdb");
|
let initdb_path = pg_bin_dir.join("initdb");
|
||||||
let mut child = Command::new(&initdb_path)
|
let mut child = Command::new(&initdb_path)
|
||||||
|
.envs(vec![
|
||||||
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
])
|
||||||
.args(["-D", pg_data_path.as_ref()])
|
.args(["-D", pg_data_path.as_ref()])
|
||||||
.spawn()
|
.spawn()
|
||||||
.expect("Failed to spawn initdb");
|
.expect("Failed to spawn initdb");
|
||||||
@@ -269,7 +283,10 @@ impl StorageController {
|
|||||||
&self.env.base_data_dir,
|
&self.env.base_data_dir,
|
||||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||||
db_start_args,
|
db_start_args,
|
||||||
[],
|
vec![
|
||||||
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
],
|
||||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||||
retry_timeout,
|
retry_timeout,
|
||||||
|| self.pg_isready(&pg_bin_dir),
|
|| self.pg_isready(&pg_bin_dir),
|
||||||
@@ -324,7 +341,10 @@ impl StorageController {
|
|||||||
&self.env.base_data_dir,
|
&self.env.base_data_dir,
|
||||||
&self.env.storage_controller_bin(),
|
&self.env.storage_controller_bin(),
|
||||||
args,
|
args,
|
||||||
[],
|
vec![
|
||||||
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
],
|
||||||
background_process::InitialPidFile::Create(self.pid_file()),
|
background_process::InitialPidFile::Create(self.pid_file()),
|
||||||
retry_timeout,
|
retry_timeout,
|
||||||
|| async {
|
|| async {
|
||||||
|
|||||||
@@ -56,6 +56,10 @@ enum Command {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
scheduling: Option<NodeSchedulingPolicy>,
|
scheduling: Option<NodeSchedulingPolicy>,
|
||||||
},
|
},
|
||||||
|
NodeDelete {
|
||||||
|
#[arg(long)]
|
||||||
|
node_id: NodeId,
|
||||||
|
},
|
||||||
/// Modify a tenant's policies in the storage controller
|
/// Modify a tenant's policies in the storage controller
|
||||||
TenantPolicy {
|
TenantPolicy {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
@@ -337,7 +341,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
Command::TenantCreate { tenant_id } => {
|
Command::TenantCreate { tenant_id } => {
|
||||||
storcon_client
|
storcon_client
|
||||||
.dispatch(
|
.dispatch::<_, ()>(
|
||||||
Method::POST,
|
Method::POST,
|
||||||
"v1/tenant".to_string(),
|
"v1/tenant".to_string(),
|
||||||
Some(TenantCreateRequest {
|
Some(TenantCreateRequest {
|
||||||
@@ -357,13 +361,16 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
tracing::info!("Delete status: {}", status);
|
tracing::info!("Delete status: {}", status);
|
||||||
}
|
}
|
||||||
Command::Nodes {} => {
|
Command::Nodes {} => {
|
||||||
let resp = storcon_client
|
let mut resp = storcon_client
|
||||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||||
Method::GET,
|
Method::GET,
|
||||||
"control/v1/node".to_string(),
|
"control/v1/node".to_string(),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||||
|
|
||||||
let mut table = comfy_table::Table::new();
|
let mut table = comfy_table::Table::new();
|
||||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||||
for node in resp {
|
for node in resp {
|
||||||
@@ -395,13 +402,16 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
Command::Tenants {} => {
|
Command::Tenants {} => {
|
||||||
let resp = storcon_client
|
let mut resp = storcon_client
|
||||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||||
Method::GET,
|
Method::GET,
|
||||||
"control/v1/tenant".to_string(),
|
"control/v1/tenant".to_string(),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
|
||||||
|
|
||||||
let mut table = comfy_table::Table::new();
|
let mut table = comfy_table::Table::new();
|
||||||
table.set_header([
|
table.set_header([
|
||||||
"TenantId",
|
"TenantId",
|
||||||
@@ -650,6 +660,11 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
|
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
|
Command::NodeDelete { node_id } => {
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
Command::TenantSetTimeBasedEviction {
|
Command::TenantSetTimeBasedEviction {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
period,
|
period,
|
||||||
|
|||||||
docs/rfcs/033-storage-controller-drain-and-fill.md (new file, 345 lines)
@@ -0,0 +1,345 @@
|
# Graceful Restarts of Storage Controller Managed Clusters
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes.
|
||||||
|
It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement
|
||||||
|
graceful cluster restarts.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
Pageserver restarts cause read availability downtime for tenants.
|
||||||
|
|
||||||
|
For example pageserver-3 @ us-east-1 was unavailable for a randomly
|
||||||
|
picked tenant (which requested on-demand activation) for around 30 seconds
|
||||||
|
during the restart at 2024-04-03 16:37 UTC.
|
||||||
|
|
||||||
|
Note that lots of shutdowns on loaded pageservers do not finish within the
|
||||||
|
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||||
|
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
||||||
|
|
||||||
|
This problem is not yet very acutely felt in storage controller managed pageservers since
|
||||||
|
tenant density is much lower there. However, we are planning on eventually migrating all
|
||||||
|
pageservers to storage controller management, so it makes sense to solve the issue proactively.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Pageserver re-deployments cause minimal downtime for tenants
|
||||||
|
- The storage controller exposes HTTP API hooks for draining and filling tenant shards
|
||||||
|
from a given pageserver. Said hooks can be used by an orchestrator process or a human operator.
|
||||||
|
- The storage controller exposes some HTTP API to cancel draining and filling background operations.
|
||||||
|
- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed
|
||||||
|
as usual (with downtime).
|
||||||
|
- Progress of draining/filling is visible through metrics
|
||||||
|
|
||||||
|
## Non Goals
|
||||||
|
|
||||||
|
- Integration with the control plane
|
||||||
|
- Graceful restarts for large non-HA tenants.
|
||||||
|
|
||||||
|
## Impacted Components
|
||||||
|
|
||||||
|
- storage controller
|
||||||
|
- deployment orchestrator (i.e. Ansible)
|
||||||
|
- pageserver (indirectly)
|
||||||
|
|
||||||
|
## Terminology
|
||||||
|
|
||||||
|
**Draining** is the process through which all tenant shards that can be migrated from a given pageserver
|
||||||
|
are distributed across the rest of the cluster.
|
||||||
|
|
||||||
|
**Filling** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given
|
||||||
|
pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers.
|
||||||
|
|
||||||
|
**Node scheduling policies** act as constraints to the scheduler. For instance, when a
|
||||||
|
node is set in the `Paused` policy, no further shards will be scheduled on it.
|
||||||
|
|
||||||
|
**Node** is a pageserver. The two terms are used interchangeably in this RFC.
|
||||||
|
|
||||||
|
**Deployment orchestrator** is a generic term for whatever drives our deployments.
|
||||||
|
Currently, it's an Ansible playbook.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
### Storage Controller Basics (skip if already familiar)
|
||||||
|
|
||||||
|
Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver nodes and tenant shards metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers.
|
||||||
|
|
||||||
|
An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook.
|
||||||
|
|
||||||
|
### Background Optimizations
|
||||||
|
|
||||||
|
The storage controller performs scheduling optimizations in the background. It will
|
||||||
|
migrate attachments to warm secondaries and replace secondaries in order to balance
|
||||||
|
the cluster out.
|
||||||
|
|
||||||
|
### Reconciliations Concurrency Limiting
|
||||||
|
|
||||||
|
There's a hard limit on the number of reconciles that the storage controller
|
||||||
|
can have in flight at any given time. To get an idea of scales, the limit is
|
||||||
|
128 at the time of writing.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
Note: this section focuses on the core functionality of the graceful restart process.
|
||||||
|
It doesn't necessarily describe the most efficient approach. Optimizations are described
|
||||||
|
separately in a later section.
|
||||||
|
|
||||||
|
### Overall Flow
|
||||||
|
|
||||||
|
This section describes how to implement graceful restarts from the perspective
|
||||||
|
of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially.
|
||||||
|
The orchestrator shall implement the following epilogue and prologue steps for each
|
||||||
|
pageserver restart:
|
||||||
|
|
||||||
|
#### Prologue
|
||||||
|
|
||||||
|
The orchestrator shall first fetch the pageserver node id from the control plane or
|
||||||
|
the pageserver it aims to restart directly. Next, it issues an HTTP request
|
||||||
|
to the storage controller in order to start the drain of said pageserver node.
|
||||||
|
All error responses are retried with a short back-off. When a 202 (Accepted)
|
||||||
|
HTTP code is returned, the drain has started. Now the orchestrator polls the
|
||||||
|
node status endpoint exposed by the storage controller in order to await the
|
||||||
|
end of the drain process. When the `policy` field of the node status response
|
||||||
|
becomes `PauseForRestart`, the drain has completed and the orchestrator can
|
||||||
|
proceed with restarting the pageserver.
|
||||||
|
|
||||||
|
The prologue is subject to an overall timeout. It will have a value in the ballpark
|
||||||
|
of minutes. As storage controller managed pageservers become more loaded this timeout
|
||||||
|
will likely have to increase.
|
||||||
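To make the prologue concrete, here is a minimal sketch (not a finalized client) of the drain-then-poll sequence in Rust. It assumes `tokio`, `anyhow`, `serde_json`, and `reqwest` with its `json` feature; the drain endpoint is the one proposed under "Draining/Filling APIs" below, while the node status path and the shape of its `policy` field are assumptions rather than a settled API.

```rust
use std::time::Duration;

use anyhow::ensure;
use reqwest::{Client, StatusCode};
use serde_json::Value;

/// Drain a node and wait until its scheduling policy reaches `PauseForRestart`,
/// at which point the pageserver can be restarted.
async fn drain_and_wait(client: &Client, base_url: &str, node_id: u64) -> anyhow::Result<()> {
    // Start the drain; a production orchestrator would retry errors with a short back-off.
    let resp = client
        .put(format!("{base_url}/v1/control/node/{node_id}/drain"))
        .send()
        .await?;
    ensure!(resp.status() == StatusCode::ACCEPTED, "drain not accepted: {}", resp.status());

    // Poll the (assumed) node status endpoint until the drain completes.
    loop {
        let status: Value = client
            .get(format!("{base_url}/v1/control/node/{node_id}"))
            .send()
            .await?
            .json()
            .await?;
        if status["policy"] == "PauseForRestart" {
            return Ok(());
        }
        tokio::time::sleep(Duration::from_secs(5)).await;
    }
}
```

The epilogue would be symmetric: `PUT .../fill` after the restart, then poll until the policy returns to `Active`, all bounded by the overall timeout described above (omitted in the sketch).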
|
|
||||||
|
#### Epilogue
|
||||||
|
|
||||||
|
After restarting the pageserver, the orchestrator issues an HTTP request
|
||||||
|
to the storage controller to kick off the filling process. This API call
|
||||||
|
may be retried for all error codes with a short backoff. This also serves
|
||||||
|
as a synchronization primitive as the fill will be refused if the pageserver
|
||||||
|
has not yet re-attached to the storage controller. When a 202(Accepted) HTTP
|
||||||
|
code is returned, the fill has started. Now the orchestrator polls the node
|
||||||
|
status endpoint exposed by the storage controller in order to await the end of
|
||||||
|
the filling process. When the `policy` field of the node status response becomes
|
||||||
|
`Active`, the fill has completed and the orchestrator may proceed to the next pageserver.
|
||||||
|
|
||||||
|
Again, the epilogue is subject to an overall timeout. We can start off with
|
||||||
|
using the same timeout as for the prologue, but can also consider relying on
|
||||||
|
the storage controller's background optimizations with a shorter timeout.
|
||||||
|
|
||||||
|
In the case that the deployment orchestrator times out, it attempts to cancel
|
||||||
|
the fill. This operation shall be retried with a short back-off. If it ultimately
|
||||||
|
fails it will require manual intervention to set the nodes scheduling policy to
|
||||||
|
`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic,
|
||||||
|
but it constrains the scheduler as mentioned previously.
|
||||||
|
|
||||||
|
### Node Scheduling Policy State Machine
|
||||||
|
|
||||||
|
The state machine below encodes the behaviours discussed above and
|
||||||
|
the various failover situations described in a later section.
|
||||||
|
|
||||||
|
Assuming no failures and/or timeouts the flow should be:
|
||||||
|
`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active`
|
||||||
|
|
||||||
|
```
|
||||||
|
Operator requested drain
|
||||||
|
+-----------------------------------------+
|
||||||
|
| |
|
||||||
|
+-------+-------+ +-------v-------+
|
||||||
|
| | | |
|
||||||
|
| Pause | +-----------> Draining +----------+
|
||||||
|
| | | | | |
|
||||||
|
+---------------+ | +-------+-------+ |
|
||||||
|
| | |
|
||||||
|
| | |
|
||||||
|
Drain requested| | |
|
||||||
|
| |Drain complete | Drain failed
|
||||||
|
| | | Cancelled/PS reattach/Storcon restart
|
||||||
|
| | |
|
||||||
|
+-------+-------+ | |
|
||||||
|
| | | |
|
||||||
|
+-------------+ Active <-----------+------------------+
|
||||||
|
| | | |
|
||||||
|
Fill requested | +---^---^-------+ |
|
||||||
|
| | | |
|
||||||
|
| | | |
|
||||||
|
| | | |
|
||||||
|
| Fill completed| | |
|
||||||
|
| | |PS reattach |
|
||||||
|
| | |after restart |
|
||||||
|
+-------v-------+ | | +-------v-------+
|
||||||
|
| | | | | |
|
||||||
|
| Filling +---------+ +-----------+PauseForRestart|
|
||||||
|
| | | |
|
||||||
|
+---------------+ +---------------+
|
||||||
|
```
|
||||||
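Read as code, the happy path of the diagram is a small state transition function. The sketch below is illustrative only; the real `NodeSchedulingPolicy` enum, its persistence, and the transition logic live in the storage controller, and the string events are placeholders for the actual triggers.

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum NodeSchedulingPolicy {
    Active,
    Pause,
    Draining,
    PauseForRestart,
    Filling,
}

/// Happy-path transitions per the diagram above; failures, cancellations,
/// pageserver re-attaches, and storcon restarts fall back to `Active`.
fn next(policy: NodeSchedulingPolicy, event: &str) -> NodeSchedulingPolicy {
    use NodeSchedulingPolicy::*;
    match (policy, event) {
        (Active | Pause, "drain requested") => Draining,
        (Draining, "drain complete") => PauseForRestart,
        (PauseForRestart, "ps reattach after restart") => Active,
        (Active, "fill requested") => Filling,
        (Filling, "fill complete") => Active,
        // Drain/fill failed, cancelled, or interrupted.
        (Draining | Filling, _) => Active,
        (p, _) => p,
    }
}
```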
|
|
||||||
|
### Draining/Filling APIs
|
||||||
|
|
||||||
|
The storage controller API to trigger the draining of a given node is:
|
||||||
|
`PUT /v1/control/node/:node_id/{drain,fill}`.
|
||||||
|
|
||||||
|
The following HTTP non-success return codes are used.
|
||||||
|
All of them are safely retriable from the perspective of the storage controller.
|
||||||
|
- 404: Requested node was not found
|
||||||
|
- 503: Requested node is known to the storage controller, but unavailable
|
||||||
|
- 412: Drain precondition failed: there is no other node to drain to or the node's scheduling policy forbids draining
|
||||||
|
- 409: A {drain, fill} is already in progress. Only one such background operation
|
||||||
|
is allowed per node.
|
||||||
|
|
||||||
|
When the drain is accepted and commenced a 202 HTTP code is returned.
|
||||||
|
|
||||||
|
Drains and fills shall be cancellable by the deployment orchestrator or a
|
||||||
|
human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200
|
||||||
|
response is returned when the cancellation is successful. Errors are retriable.
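A sketch of how a deployment orchestrator might drive these endpoints, assuming `reqwest` and `tokio`; the retry policy is deliberately naive, and a real client would bound its retries and log response bodies:

```rust
use std::time::Duration;

use reqwest::StatusCode;

/// Ask the storage controller to start draining `node_id`, retrying the
/// retriable error codes listed above, until a 202 has been observed.
async fn request_drain(
    client: &reqwest::Client,
    storcon_url: &str,
    node_id: u64,
) -> anyhow::Result<()> {
    loop {
        let resp = client
            .put(format!("{storcon_url}/v1/control/node/{node_id}/drain"))
            .send()
            .await?;
        match resp.status() {
            StatusCode::ACCEPTED => return Ok(()),
            // 404 / 503 / 412 / 409 are all safely retriable per this RFC;
            // a production client would bound these retries.
            StatusCode::NOT_FOUND
            | StatusCode::SERVICE_UNAVAILABLE
            | StatusCode::PRECONDITION_FAILED
            | StatusCode::CONFLICT => {
                tokio::time::sleep(Duration::from_secs(2)).await;
            }
            other => anyhow::bail!("unexpected status {other}"),
        }
    }
}

/// Cancel an in-flight drain, e.g. when the orchestrator's timeout expires.
async fn cancel_drain(
    client: &reqwest::Client,
    storcon_url: &str,
    node_id: u64,
) -> anyhow::Result<()> {
    client
        .delete(format!("{storcon_url}/v1/control/node/{node_id}/drain"))
        .send()
        .await?
        .error_for_status()?;
    Ok(())
}
```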
|
||||||
|
|
||||||
|
### Drain Process
|
||||||
|
|
||||||
|
Before accepting a drain request, the following validations are applied:
|
||||||
|
* Ensure that the node is known to the storage controller
|
||||||
|
* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
|
||||||
|
* Ensure that another drain or fill is not already running on the node
|
||||||
|
* Ensure that a drain is possible (i.e. check that there is at least one
|
||||||
|
schedulable node to drain to)
|
||||||
|
|
||||||
|
After accepting the drain, the scheduling policy of the node is set to
|
||||||
|
`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
|
||||||
|
This disallows the optimizer from adding or removing shards from the node which
|
||||||
|
is desirable to avoid them racing.
|
||||||
|
|
||||||
|
Next, a separate Tokio task is spawned to manage the draining. For each tenant
|
||||||
|
shard attached to the node being drained, demote the attached location on that node to a secondary and
attempt to schedule the shard away. Scheduling might fail due to unsatisfiable
|
||||||
|
constraints, but that is fine. Draining is a best effort process since it might
|
||||||
|
not always be possible to cut over all shards.
|
||||||
|
|
||||||
|
Importantly, this task manages the concurrency of issued reconciles in order to
|
||||||
|
avoid drowning out the target pageservers and to allow other important reconciles
|
||||||
|
to proceed.
|
||||||
|
|
||||||
|
Once the triggered reconciles have finished or timed out, set the node's scheduling
|
||||||
|
policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.
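A condensed sketch of that drain task; `demote_and_schedule_away`, `set_node_scheduling_policy` and the shard id type are hypothetical stand-ins for the storage controller's real machinery, and the semaphore models the reconcile concurrency limit:

```rust
use std::sync::Arc;

use tokio::sync::Semaphore;

// Illustrative stand-in for the real tenant shard id type.
type TenantShardId = u128;

async fn drain_node(
    node_id: u64,
    attached_shards: Vec<TenantShardId>,
    max_concurrent_reconciles: usize,
) {
    // Bound how many reconciles the drain may have in flight at once, so that
    // target pageservers are not drowned out and other reconciles can proceed.
    let limiter = Arc::new(Semaphore::new(max_concurrent_reconciles));
    let mut tasks = tokio::task::JoinSet::new();

    for shard in attached_shards {
        let permit = limiter.clone().acquire_owned().await.expect("not closed");
        tasks.spawn(async move {
            let _permit = permit;
            // Demote the draining node to a secondary for this shard and ask
            // the scheduler to place the attachment elsewhere. Scheduling may
            // fail due to unsatisfiable constraints; draining is best effort,
            // so we just log and move on.
            if let Err(e) = demote_and_schedule_away(shard, node_id).await {
                tracing::warn!("failed to drain shard {}: {:#}", shard, e);
            }
        });
    }
    while tasks.join_next().await.is_some() {}

    // Signal the end of the drain to the orchestrator.
    set_node_scheduling_policy(node_id, "PauseForRestart").await;
}

// Hypothetical helpers; the real logic lives in the storage controller.
async fn demote_and_schedule_away(_shard: TenantShardId, _node: u64) -> anyhow::Result<()> { Ok(()) }
async fn set_node_scheduling_policy(_node: u64, _policy: &str) {}
```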
|
||||||
|
|
||||||
|
A note on non HA tenants: These tenants do not have secondaries, so by the description
|
||||||
|
above, they would not be migrated. It makes sense to skip them (especially the large ones)
|
||||||
|
since, depending on tenant size, this might be more disruptive than the restart itself: the
pageserver we've moved to will need to on-demand download the entire working set for the tenant.
|
||||||
|
We can consider expanding to small non-HA tenants in the future.
|
||||||
|
|
||||||
|
### Fill Process
|
||||||
|
|
||||||
|
Before accepting a fill request, the following validations are applied:
|
||||||
|
* Ensure that the node is known to the storage controller
|
||||||
|
* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`.
|
||||||
|
This is the only acceptable policy for the fill starting state. When a node re-attaches,
|
||||||
|
it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to
|
||||||
|
`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
|
||||||
|
* Ensure that another drain or fill is not already running on the node
|
||||||
|
|
||||||
|
After accepting the fill, the scheduling policy of the node is set to
|
||||||
|
`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
|
||||||
|
This disallows the optimizer from adding or removing shards from the node which
|
||||||
|
is desirable to avoid them racing.
|
||||||
|
|
||||||
|
Next, a separate Tokio task is spawned to manage the filling. For each tenant
|
||||||
|
shard where the filled node is a secondary, promote the secondary. This is done
|
||||||
|
until we run out of shards or the counts of attached shards become balanced across
|
||||||
|
the cluster.
|
||||||
|
|
||||||
|
Like for draining, the concurrency of spawned reconciles is limited.
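For symmetry, a sketch of the fill task with the same caveats: the helpers are hypothetical, the concurrency limit is elided, and the balance target would in practice be computed from per-node attachment counts:

```rust
// Illustrative stand-in for the real tenant shard id type.
type TenantShardId = u128;

struct FillPlan {
    /// Shards for which the filled node currently holds a secondary location.
    candidates: Vec<TenantShardId>,
    /// How many promotions are needed before attached-shard counts are
    /// balanced across the cluster.
    promotions_wanted: usize,
}

async fn fill_node(node_id: u64, plan: FillPlan) {
    let mut promoted = 0;
    for shard in plan.candidates {
        if promoted >= plan.promotions_wanted {
            break; // counts are balanced; stop early
        }
        // Promote the secondary on the filled node to the attached location.
        // As with draining, these reconciles run under a concurrency limit
        // (elided here) so other work is not starved.
        match promote_secondary(shard, node_id).await {
            Ok(()) => promoted += 1,
            Err(e) => tracing::warn!("failed to promote {} onto node {}: {:#}", shard, node_id, e),
        }
    }
    // Return the node to the neutral state once the fill has quiesced.
    set_node_scheduling_policy(node_id, "Active").await;
}

// Hypothetical helpers; the real logic lives in the storage controller.
async fn promote_secondary(_shard: TenantShardId, _node: u64) -> anyhow::Result<()> { Ok(()) }
async fn set_node_scheduling_policy(_node: u64, _policy: &str) {}
```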
|
||||||
|
|
||||||
|
### Failure Modes & Handling
|
||||||
|
|
||||||
|
Failures are generally handled by transition back into the `Active`
|
||||||
|
(neutral) state. This simplifies the implementation greatly at the
|
||||||
|
cost of adding transitions to the state machine. For example, we
|
||||||
|
could detect the `Draining` state upon restart and proceed with a drain,
|
||||||
|
but how would the storage controller know that this is still what the
orchestrator needs?
|
||||||
|
|
||||||
|
#### Storage Controller Crash
|
||||||
|
|
||||||
|
When the storage controller starts up, it resets the node scheduling policy
|
||||||
|
of all nodes in states `Draining`, `Filling` or `PauseForRestart` to
|
||||||
|
`Active`. The rationale is that when the storage controller restarts,
|
||||||
|
we have lost context of what the deployment orchestrator wants. It also
|
||||||
|
has the benefit of making things easier to reason about.
|
||||||
|
|
||||||
|
#### Pageserver Crash During Drain
|
||||||
|
|
||||||
|
The pageserver will attempt to re-attach during restart at which
|
||||||
|
point the node scheduling policy will be set back to `Active`, thus
|
||||||
|
reenabling the scheduler to use the node.
|
||||||
|
|
||||||
|
#### Non-drained Pageserver Crash During Drain
|
||||||
|
|
||||||
|
What should happen when a pageserver we are draining to crashes during the
process? Two reasonable options are: cancel the drain and focus on the failover,
*or* do both, but prioritise the failover. Since the number of concurrent reconciles
produced by drains/fills is limited, we get the latter behaviour for free.
|
||||||
|
My suggestion is we take this approach, but the cancellation option is trivial
|
||||||
|
to implement as well.
|
||||||
|
|
||||||
|
#### Pageserver Crash During Fill
|
||||||
|
|
||||||
|
The pageserver will attempt to re-attach during restart at which
|
||||||
|
point the node scheduling policy will be set back to `Active`, thus
|
||||||
|
reenabling the scheduler to use the node.
|
||||||
|
|
||||||
|
#### Pageserver Goes Unavailable During Drain/Fill
|
||||||
|
|
||||||
|
The drain and fill jobs handle this by stopping early. When the pageserver
|
||||||
|
is detected as online by storage controller heartbeats, reset its scheduling
|
||||||
|
policy to `Active`. If a restart happens instead, see the pageserver crash
|
||||||
|
failure mode.
|
||||||
|
|
||||||
|
#### Orchestrator Drain Times Out
|
||||||
|
|
||||||
|
The orchestrator will still proceed with the restart.
|
||||||
|
When the pageserver re-attaches, the scheduling policy is set back to
|
||||||
|
`Active`.
|
||||||
|
|
||||||
|
#### Orchestrator Fill Times Out
|
||||||
|
|
||||||
|
The orchestrator will attempt to cancel the fill operation. If that fails,
|
||||||
|
the fill will continue until it quiesces and the node will be left
|
||||||
|
in the `Filling` scheduling policy. This hinders the scheduler, but is
|
||||||
|
otherwise harmless. A human operator can handle this by setting the scheduling
|
||||||
|
policy to `Active`, or we can bake a fill timeout into the storage controller.
|
||||||
|
|
||||||
|
## Optimizations
|
||||||
|
|
||||||
|
### Location Warmth
|
||||||
|
|
||||||
|
When cutting over to a secondary, the storage controller will wait for it to
|
||||||
|
become "warm" (i.e. download enough of the tenants data). This means that some
|
||||||
|
reconciliations can take significantly longer than others and hold up precious
|
||||||
|
reconciliation units. As an optimization, the drain stage can only cut over
|
||||||
|
tenants that are already "warm". Similarly, the fill stage can prioritise the
|
||||||
|
"warmest" tenants in the fill.
|
||||||
|
|
||||||
|
Given that the number of tenants managed by the storage controller will be fairly low
for the foreseeable future, the first implementation could simply query the tenants
|
||||||
|
for secondary status. This doesn't scale well with increasing tenant counts, so
|
||||||
|
eventually we will need new pageserver API endpoints to report the sets of
|
||||||
|
"warm" and "cold" nodes.
|
||||||
|
|
||||||
|
## Alternatives Considered
|
||||||
|
|
||||||
|
### Draining and Filling Purely as Scheduling Constraints
|
||||||
|
|
||||||
|
At its core, the storage controller is a big background loop that detects changes
|
||||||
|
in the environment and reacts on them. One could express draining and filling
|
||||||
|
of nodes purely in terms of constraining the scheduler (as opposed to having
|
||||||
|
such background tasks).
|
||||||
|
|
||||||
|
While theoretically nice, I think that's harder to implement and, more importantly, to operate and reason about.
|
||||||
|
Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create
|
||||||
|
an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish
|
||||||
|
to cancel the reconciliation tasks spawned by node drains/fills. How would we know which ones belong
|
||||||
|
to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion.
|
||||||
|
|
||||||
|
It would also mean that reconciliations themselves have side effects that persist in the database
|
||||||
|
(persist something to the database when the drain is done), which I'm not conceptually fond of.
|
||||||
|
|
||||||
|
## Proof of Concept
|
||||||
|
|
||||||
|
This RFC is accompanied by a POC which implements nearly everything mentioned here
|
||||||
|
apart from the optimizations and some of the failure handling:
|
||||||
|
https://github.com/neondatabase/neon/pull/7682
|
||||||
507
docs/rfcs/034-timeline-archive.md
Normal file
@@ -0,0 +1,507 @@
|
|||||||
|
# Timeline Archival
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
This RFC describes a mechanism for pageservers to eliminate local storage + compute work
|
||||||
|
for timelines which are not in use, in response to external API calls to "archive" a timeline.
|
||||||
|
|
||||||
|
The archived state roughly corresponds to fully offloading a timeline to object storage, such
|
||||||
|
that its cost is purely the cost of that object storage.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
Archived timelines serve multiple purposes:
|
||||||
|
- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
|
||||||
|
database from longer ago than their PITR window.
|
||||||
|
- Enable users to create huge numbers of branches (e.g. one per github PR) without having
|
||||||
|
to diligently clean them up later to avoid overloading the pageserver (currently we support
|
||||||
|
up to ~500 branches per tenant).
|
||||||
|
|
||||||
|
### Prior art
|
||||||
|
|
||||||
|
Most storage and database systems have some form of snapshot, which can be implemented several ways:
|
||||||
|
1. full copies of data (e.g. an EBS snapshot to S3)
|
||||||
|
2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
|
||||||
|
3. a series of snapshots which are CoW or de-duplicated relative to one another.
|
||||||
|
|
||||||
|
Today's Neon branches are approximately like `2.`, although due to implementation details branches
|
||||||
|
often end up storing much more data than they really need, as parent branches assume that all data
|
||||||
|
at the branch point is needed. The layers pinned in the parent branch may have a much larger size
|
||||||
|
than the physical size of a compressed image layer representing the data at the branch point.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Enter & exit the archived state in response to external admin API calls
|
||||||
|
- API calls to modify the archived state are atomic and durable
|
||||||
|
- An archived timeline should eventually (once out of PITR window) use an efficient compressed
|
||||||
|
representation, and avoid retaining arbitrarily large data in its parent branch.
|
||||||
|
- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
|
||||||
|
but must not scale with the number of _archived_ branches.
|
||||||
|
- Background I/O for archived branches should only be done a limited number of times to evolve them
|
||||||
|
to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping"
|
||||||
|
overhead for archived branches, including operations related to calculating sizes for billing.
|
||||||
|
- The pageserver should put no load on the safekeeper for archived branches.
|
||||||
|
- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
|
||||||
|
to a performant state in a short time (linear with the branch's logical size)
|
||||||
|
|
||||||
|
## Non Goals
|
||||||
|
|
||||||
|
- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
|
||||||
|
in Neon's internal format.
|
||||||
|
- Compute cold starts after activating an archived branch will not have comparable performance to
|
||||||
|
cold starts on an active branch.
|
||||||
|
- Archived branches will not use any new/additional compression or de-duplication beyond what
|
||||||
|
is already implemented for image layers (zstd per page).
|
||||||
|
- The pageserver will not "auto start" archived branches in response to page_service API requests: they
|
||||||
|
are only activated explicitly via the HTTP API.
|
||||||
|
- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
|
||||||
|
remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
|
||||||
|
- We will not expose any prometheus metrics for archived timelines, or make them visible in any
|
||||||
|
detailed HTTP APIs other than the specific API for listing archived timelines.
|
||||||
|
- A parent branch may not be archived unless all its children are.
|
||||||
|
|
||||||
|
## Impacted Components
|
||||||
|
|
||||||
|
pageserver, storage controller
|
||||||
|
|
||||||
|
## Terminology
|
||||||
|
|
||||||
|
**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
|
||||||
|
may assume that this branch is now very cheap to store, although this may not be physically so until the
|
||||||
|
branch proceeds to the offloaded state.
|
||||||
|
|
||||||
|
**Active** branches are branches which are available for use by page_service clients, and have a relatively
|
||||||
|
high cost due to consuming local storage.
|
||||||
|
|
||||||
|
**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
|
||||||
|
that they now consume minimal runtime resources and have a cost similar to the cost of object storage.
|
||||||
|
|
||||||
|
**Activate** (verb): transition from Archived to Active
|
||||||
|
|
||||||
|
**Archive** (verb): transition from Active to Archived
|
||||||
|
|
||||||
|
**Offload** (verb): transition from Archived to Offloaded
|
||||||
|
|
||||||
|
**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.
|
||||||
|
|
||||||
|
**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is
|
||||||
|
warmed up, good performance will be available to page_service clients.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### High level flow
|
||||||
|
|
||||||
|
We may think of a timeline which is archived and then activated as proceeding through a series of states:
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
stateDiagram
|
||||||
|
[*] --> Active(warm)
|
||||||
|
Active(warm) --> Archived
|
||||||
|
Archived --> Offloaded
|
||||||
|
Archived --> Active(warm)
|
||||||
|
Offloaded --> Active(cold)
|
||||||
|
Active(cold) --> Active(warm)
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
|
||||||
|
of branches will be:
|
||||||
|
- Very frequent: Short lived branches: Active -> Deleted
|
||||||
|
- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
|
||||||
|
- Rare: Branches used to restore old state: Active -> Archived -> Offloaded -> Active
|
||||||
|
|
||||||
|
These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
|
||||||
|
of:
|
||||||
|
- the timeline's lifecycle state: active or archived, stored in the timeline's index
|
||||||
|
- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the
|
||||||
|
manifest of offloaded timelines.
|
||||||
|
- cache state (whether it's warm or cold).
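As a sketch, the externally visible state in the diagram can be derived from those three ingredients roughly as follows; the names are illustrative, not the pageserver's actual types:

```rust
/// Lifecycle state stored in the timeline's index_part.json.
enum Lifecycle {
    Active,
    Archived,
}

/// The composite, externally visible state used in the diagram above.
enum VisibleState {
    ActiveWarm,
    ActiveCold,
    Archived,
    Offloaded,
}

fn visible_state(lifecycle: Lifecycle, offloaded: bool, warm: bool) -> VisibleState {
    match (lifecycle, offloaded) {
        // An offloaded timeline has no local state at all.
        (Lifecycle::Archived, true) => VisibleState::Offloaded,
        (Lifecycle::Archived, false) => VisibleState::Archived,
        // An active timeline is never offloaded; warmth is just cache state.
        (Lifecycle::Active, _) if warm => VisibleState::ActiveWarm,
        (Lifecycle::Active, _) => VisibleState::ActiveCold,
    }
}
```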
|
||||||
|
|
||||||
|
### Storage format changes
|
||||||
|
|
||||||
|
There are two storage format changes:
|
||||||
|
1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
|
||||||
|
be considered active or archived.
|
||||||
|
2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
|
||||||
|
at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)
|
||||||
|
|
||||||
|
The manifest object will have a format like this:
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"offload_timelines": [
|
||||||
|
{
|
||||||
|
"timeline_id": ...
|
||||||
|
"last_record_lsn": ...
|
||||||
|
"last_record_lsn_time": ...
|
||||||
|
"pitr_interval": ...
|
||||||
|
"last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
|
||||||
|
"logical_size": ... # The size at last_record_lsn
|
||||||
|
"physical_size" ...
|
||||||
|
"parent": Option<{
|
||||||
|
"timeline_id"...
|
||||||
|
"lsn"... # Branch point LSN on the parent
|
||||||
|
"requires_data": bool # True if this branch depends on layers in its parent, identify it here
|
||||||
|
|
||||||
|
}>
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
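A sketch of how the manifest might be modelled with `serde` (assuming chrono's `serde` feature for the timestamp); field names follow the JSON above, while the id, LSN and duration types are simplified stand-ins:

```rust
use serde::{Deserialize, Serialize};

// Stand-in types: the real code would use the pageserver's Lsn and TimelineId.
type TimelineId = String;
type Lsn = u64;

#[derive(Serialize, Deserialize)]
struct TenantManifest {
    offload_timelines: Vec<OffloadedTimeline>,
}

#[derive(Serialize, Deserialize)]
struct OffloadedTimeline {
    timeline_id: TimelineId,
    last_record_lsn: Lsn,
    /// Wall-clock time of last_record_lsn, used for the PITR check below.
    last_record_lsn_time: chrono::DateTime<chrono::Utc>,
    /// Serialized representation left to the implementation.
    pitr_interval: std::time::Duration,
    /// Equal to last_record_lsn if this branch has no history (i.e. a snapshot).
    last_gc_lsn: Lsn,
    /// The size at last_record_lsn, reported in consumption metrics.
    logical_size: u64,
    physical_size: u64,
    parent: Option<OffloadedParent>,
}

#[derive(Serialize, Deserialize)]
struct OffloadedParent {
    timeline_id: TimelineId,
    /// Branch point LSN on the parent.
    lsn: Lsn,
    /// True if this branch still depends on layers in its parent.
    requires_data: bool,
}
```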
|
||||||
|
|
||||||
|
The information about a timeline in its offload state is intentionally minimal: just enough to decide:
|
||||||
|
- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this
|
||||||
|
by checking if `now > last_record_lsn_time + pitr_interval`, and `last_gc_lsn < last_record_lsn`.
|
||||||
|
- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
|
||||||
|
layers that the archived branch depends on
|
||||||
|
- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
|
||||||
|
is received for a timeline_id that isn't in the set of live `Timelines` or in the manifest, then
we don't need to go to S3 for the deletion).
|
||||||
|
- How much archived space to report in consumption metrics
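The first of those checks could be expressed over the manifest fields roughly as below; this is a sketch of the inequality only, and the real decision would also consult the heuristics in the Optimizations section:

```rust
use std::time::Duration;

use chrono::{DateTime, Utc};

/// Does this offloaded timeline still have history of its own that could be
/// rewritten into image layers now that its PITR window has elapsed?
fn needs_flattening(
    last_record_lsn: u64,
    last_gc_lsn: u64,
    last_record_lsn_time: DateTime<Utc>,
    pitr_interval: Duration,
    now: DateTime<Utc>,
) -> bool {
    let pitr_elapsed =
        now > last_record_lsn_time + chrono::Duration::from_std(pitr_interval).unwrap();
    let has_history = last_gc_lsn < last_record_lsn;
    pitr_elapsed && has_history
}
```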
|
||||||
|
|
||||||
|
The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
|
||||||
|
set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
|
||||||
|
(offloaded timelines).
|
||||||
|
|
||||||
|
For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
|
||||||
|
index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but
|
||||||
|
give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code
|
||||||
|
for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
|
||||||
|
the manifest file.
|
||||||
|
|
||||||
|
### API & Timeline state
|
||||||
|
|
||||||
|
Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will
|
||||||
|
be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which
|
||||||
|
may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
|
||||||
|
a per-timeline configuration).
|
||||||
|
|
||||||
|
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
|
||||||
|
```
|
||||||
|
{
|
||||||
|
'state': 'active|archive'
|
||||||
|
}
|
||||||
|
```
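A sketch of the corresponding request body types, assuming `serde` and the two values shown above:

```rust
use serde::{Deserialize, Serialize};

/// Body of `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`.
/// Deliberately generic so that further per-timeline attributes can be added later.
#[derive(Serialize, Deserialize)]
struct TimelineConfigureRequest {
    state: TimelineLifecycle,
}

#[derive(Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
enum TimelineLifecycle {
    Active,
    Archive,
}
```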
|
||||||
|
|
||||||
|
When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
|
||||||
|
|
||||||
|
When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
|
||||||
|
**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's
|
||||||
|
index, but not any data: it should be about as fast as a couple of small S3 requests.
|
||||||
|
|
||||||
|
The API will be available with identical path via the storage controller: calling this on a sharded tenant
|
||||||
|
will simply map the API call to all the shards.
|
||||||
|
|
||||||
|
Archived timelines may never have descendant timelines which are active. This will be enforced at the API level,
such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
that all its descendants are archived. It is the caller's responsibility to walk the hierarchy of timelines
|
||||||
|
in the proper order if they would like to archive whole trees of branches.
|
||||||
|
|
||||||
|
Because archived timelines will be excluded from the usual timeline listing APIs, a new API specifically
|
||||||
|
for archived timelines will be added: this is for use in support/debug:
|
||||||
|
|
||||||
|
```
|
||||||
|
GET /v1/tenants/{tenant_id}/archived_timelines
|
||||||
|
|
||||||
|
{
|
||||||
|
...same per-timeline content as the tenant manifest...
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tenant attach changes
|
||||||
|
|
||||||
|
Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
|
||||||
|
we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived
|
||||||
|
timelines, we must have a single object that tells us which timelines do not need to be loaded. The
|
||||||
|
number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
|
||||||
|
because each request covers 1000 timelines.
|
||||||
|
|
||||||
|
This is **not** literally the same as the set of timelines that have state=archived. Rather, it is
|
||||||
|
the set of timelines which have been offloaded in the background after their state was set to archived.
|
||||||
|
|
||||||
|
We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
|
||||||
|
exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need
|
||||||
|
to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
|
||||||
|
to delete an offloaded timeline.
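A sketch of the resulting startup logic, with hypothetical helpers standing in for the remote storage listing and the manifest download (the manifest is simplified to a list of ids here):

```rust
use std::collections::HashSet;

type TimelineId = String;

/// Decide which timelines to actually load during Tenant::spawn. Offloaded
/// timelines are skipped entirely: no Timeline object, no index_part GET.
async fn timelines_to_load(tenant_path: &str) -> anyhow::Result<Vec<TimelineId>> {
    // One (paginated) listing of timeline prefixes: still O(N) ListObjects,
    // but each request covers up to 1000 timelines.
    let all: Vec<TimelineId> = list_timeline_prefixes(tenant_path).await?;

    // One GET for the latest-generation tenant manifest, if it exists.
    let offloaded: HashSet<TimelineId> = load_tenant_manifest(tenant_path)
        .await?
        .map(|m| m.offload_timelines.into_iter().collect())
        .unwrap_or_default();

    Ok(all.into_iter().filter(|t| !offloaded.contains(t)).collect())
}

// Hypothetical helpers standing in for remote storage access.
async fn list_timeline_prefixes(_p: &str) -> anyhow::Result<Vec<TimelineId>> { Ok(vec![]) }
struct Manifest { offload_timelines: Vec<TimelineId> }
async fn load_tenant_manifest(_p: &str) -> anyhow::Result<Option<Manifest>> { Ok(None) }
```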
|
||||||
|
|
||||||
|
### Warm-up API
|
||||||
|
|
||||||
|
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
|
||||||
|
|
||||||
|
This API will be similar to the existing `download_remote_layers` API, but smarter:
|
||||||
|
- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
|
||||||
|
- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
|
||||||
|
of downloads, so that the caller can poll.
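A sketch of how a caller might drive this endpoint; the progress response type is an assumption, since the RFC only requires "a struct describing progress":

```rust
use std::time::Duration;

use serde::Deserialize;

// Hypothetical progress report returned by the download endpoint.
#[derive(Deserialize)]
struct DownloadProgress {
    layers_total: u64,
    layers_downloaded: u64,
}

async fn warm_up_timeline(
    client: &reqwest::Client,
    base: &str,
    tenant_id: &str,
    timeline_id: &str,
) -> anyhow::Result<()> {
    loop {
        let progress: DownloadProgress = client
            .put(format!(
                "{base}/v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=5000"
            ))
            .send()
            .await?
            .error_for_status()?
            .json()
            .await?;
        if progress.layers_downloaded >= progress.layers_total {
            return Ok(());
        }
        // Brief pause between polls; each call already waits up to wait_ms server-side.
        tokio::time::sleep(Duration::from_millis(500)).await;
    }
}
```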
|
||||||
|
|
||||||
|
The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
|
||||||
|
of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
|
||||||
|
can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache
|
||||||
|
eviction and heatmaps, as well as in this specific case of warming up a timeline.
|
||||||
|
|
||||||
|
The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised
|
||||||
|
to call it, because otherwise populating local contents for a timeline can take a long time when waiting
|
||||||
|
for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
|
||||||
|
volatile.
|
||||||
|
|
||||||
|
### Background work
|
||||||
|
|
||||||
|
Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters
|
||||||
|
an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
|
||||||
|
([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk
|
||||||
|
if its state permits that.
|
||||||
|
|
||||||
|
Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
|
||||||
|
optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
|
||||||
|
has elapsed and it can now be rewritten to image layers.
|
||||||
|
|
||||||
|
#### Archive branch offload
|
||||||
|
|
||||||
|
Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
|
||||||
|
any actual work.
|
||||||
|
|
||||||
|
This work is done in the background compaction loop. It makes sense to tack this work onto the compaction
|
||||||
|
loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
|
||||||
|
|
||||||
|
The condition for offload is simple:
|
||||||
|
- a `Timeline` object exists with state `Archived`
|
||||||
|
- the timeline does not have any non-offloaded children.
|
||||||
|
|
||||||
|
Regarding the condition that children must be offloaded, this will always be eventually true, because
|
||||||
|
we enforce at the API level that children of archived timelines must themselves be archived, and all
|
||||||
|
archived timelines will eventually be offloaded.
|
||||||
|
|
||||||
|
Offloading a timeline is simple:
|
||||||
|
- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
|
||||||
|
- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
|
||||||
|
- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
|
||||||
|
- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
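In code, the offload step might look roughly like the sketch below; the `Tenant` methods are hypothetical stand-ins, and the real implementation has to hold the appropriate tenant and timeline locks:

```rust
/// Offload one archived timeline: tear down its in-memory state, drop its
/// local files, and record it in the tenant manifest so it is not loaded on
/// the next startup.
async fn offload_timeline(tenant: &mut Tenant, timeline_id: &str) -> anyhow::Result<()> {
    // Capture the attributes we keep in the manifest (especially logical size)
    // before the Timeline object goes away.
    let manifest_entry = tenant.describe_for_offload(timeline_id)?;

    // Shut the timeline down and detach it from the tenant, as deletion would.
    tenant.shutdown_timeline(timeline_id).await?;

    // Remove all of the timeline's content from local storage.
    let local_path = tenant.timeline_local_path(timeline_id);
    tokio::fs::remove_dir_all(&local_path).await?;

    // Persist the new manifest (with generation suffix). If this upload fails
    // we give up and retry on the next iteration of the compaction loop.
    tenant.offloaded.push(manifest_entry);
    tenant.upload_tenant_manifest().await?;
    Ok(())
}

// Hypothetical tenant type with just the pieces used above.
struct Tenant { offloaded: Vec<OffloadEntry> }
struct OffloadEntry;
impl Tenant {
    fn describe_for_offload(&self, _t: &str) -> anyhow::Result<OffloadEntry> { Ok(OffloadEntry) }
    async fn shutdown_timeline(&mut self, _t: &str) -> anyhow::Result<()> { Ok(()) }
    fn timeline_local_path(&self, t: &str) -> std::path::PathBuf { std::path::PathBuf::from(t) }
    async fn upload_tenant_manifest(&self) -> anyhow::Result<()> { Ok(()) }
}
```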
|
||||||
|
|
||||||
|
#### Archive branch optimization (flattening)
|
||||||
|
|
||||||
|
When we offloaded a branch, it might have had some history that prevented rewriting it to a single
|
||||||
|
point in time set of image layers. For example, a branch might have several days of writes and a 7
|
||||||
|
day PITR: when we archive it, it still has those days of history.
|
||||||
|
|
||||||
|
Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
|
||||||
|
- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
|
||||||
|
a point in time compared with delta layers
|
||||||
|
- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
|
||||||
|
for data, i.e. the ancestor is free to GC layer files at and below the branch point
|
||||||
|
|
||||||
|
Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
|
||||||
|
branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
|
||||||
|
a true snapshot at that LSN.
|
||||||
|
|
||||||
|
It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
|
||||||
|
is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper)
|
||||||
|
|
||||||
|
Archive branch optimization should be done _before_ background offloads during compaction, because there may
|
||||||
|
be timelines which are ready to be offloaded but also would benefit from the optimization step before
|
||||||
|
being offloaded. For example, a branch which has already fallen out of PITR window and has no history
|
||||||
|
of its own may be immediately re-written as a series of image layers before being offloaded.
|
||||||
|
|
||||||
|
### Consumption metrics
|
||||||
|
|
||||||
|
Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, anticipating
|
||||||
|
that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
|
||||||
|
vs. ordinary content.
|
||||||
|
|
||||||
|
Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
|
||||||
|
variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
|
||||||
|
|
||||||
|
### Secondary locations
|
||||||
|
|
||||||
|
Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby
|
||||||
|
when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents
|
||||||
|
will be dropped from secondary locations.
|
||||||
|
|
||||||
|
### Sharding
|
||||||
|
|
||||||
|
Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
|
||||||
|
the same way that timeline creation and deletion is done. There are no special rules about ordering:
|
||||||
|
the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.
|
||||||
|
|
||||||
|
Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
|
||||||
|
will be authoritative for consumption metrics.
|
||||||
|
|
||||||
|
## Error cases
|
||||||
|
|
||||||
|
### Errors in sharded tenants
|
||||||
|
|
||||||
|
If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
|
||||||
|
state, where a timeline is archived on some shards but not on others.
|
||||||
|
|
||||||
|
We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
|
||||||
|
are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
|
||||||
|
In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
|
||||||
|
up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent
|
||||||
|
state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't
|
||||||
|
break anything, it's just "weird".
|
||||||
|
|
||||||
|
This is similar to the status quo for timeline creation and deletion: callers are expected to retry
|
||||||
|
these operations until they succeed.
|
||||||
|
|
||||||
|
### Archiving/activating
|
||||||
|
|
||||||
|
Archiving/activating a timeline can fail in a limited number of ways:
|
||||||
|
1. I/O error storing/reading the timeline's updated index
|
||||||
|
- These errors are always retryable: a fundamental design assumption of the pageserver is that remote
|
||||||
|
storage errors are always transient.
|
||||||
|
2. NotFound if the timeline doesn't exist
|
||||||
|
- Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
|
||||||
|
- The storage controller has runtime locking to prevent races such as deleting a timeline while
|
||||||
|
archiving it.
|
||||||
|
3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated
|
||||||
|
- Callers are expected to do their own checks to avoid hitting this case. If they make
|
||||||
|
a mistake and encounter this error, they should give up.
|
||||||
|
|
||||||
|
### Offloading
|
||||||
|
|
||||||
|
Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
|
||||||
|
tenant manifest. In such error cases, we give up in the expectation that offloading will be tried
|
||||||
|
again at the next iteration of the compaction loop.
|
||||||
|
|
||||||
|
### Archive branch optimization
|
||||||
|
|
||||||
|
Optimization is a special form of compaction, so can encounter all the same errors as regular compaction
|
||||||
|
can: it should return Result<(), CompactionError>, and as with compaction it will be retried on
|
||||||
|
the next iteration of the compaction loop.
|
||||||
|
|
||||||
|
## Optimizations
|
||||||
|
|
||||||
|
### Delaying storage optimization if retaining parent layers is cheaper
|
||||||
|
|
||||||
|
Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
|
||||||
|
is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
|
||||||
|
are offloaded to S3 they're totally safe, inert things.
|
||||||
|
|
||||||
|
However, in some cases it can be advantageous to retain extra history on their parent branch rather
|
||||||
|
than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB
|
||||||
|
of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
|
||||||
|
for each nightly branch is inefficient compared with just keeping more history on the main branch.
|
||||||
|
|
||||||
|
Getting this right requires consideration of:
|
||||||
|
- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
|
||||||
|
write out extra image layers, then it might make more sense to just write out the image layers on
|
||||||
|
the archived branch.
|
||||||
|
- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
|
||||||
|
the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely
|
||||||
|
large layer map can cause problems elsewhere.
|
||||||
|
|
||||||
|
This optimization can probably be implemented quite cheaply with some basic heuristics like:
|
||||||
|
- don't bother doing optimization on an archive branch if the LSN distance between
|
||||||
|
its branch point and the end of the PITR window is <5% of the logical size of the archive branch.
|
||||||
|
- ...but don't keep more history on the main branch than double the PITR window.
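A sketch of such a heuristic, using the illustrative numbers from the list above (5% of logical size, double the PITR window); these are not tuned values:

```rust
/// Decide whether to flatten an archived branch now, or to keep relying on
/// history retained in its parent.
fn should_flatten_now(
    // LSN distance between the branch point and the end of the parent's PITR window.
    lsn_distance_to_pitr_end: u64,
    // Logical size of the archived branch.
    branch_logical_size: u64,
    // How much history the parent is currently retaining, and its PITR setting.
    parent_retained_history: u64,
    parent_pitr_window: u64,
) -> bool {
    // Cheap to keep the parent history instead? Then skip flattening for now...
    let cheap_to_retain =
        lsn_distance_to_pitr_end < branch_logical_size / 20; // < 5% of logical size
    // ...unless the parent would have to retain more than double its PITR window.
    let parent_overloaded = parent_retained_history > 2 * parent_pitr_window;

    !cheap_to_retain || parent_overloaded
}
```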
|
||||||
|
|
||||||
|
### Creating a timeline in archived state (a snapshot)
|
||||||
|
|
||||||
|
Sometimes, one might want to create a branch with no history, which will not be written to
|
||||||
|
before it is archived. This is a snapshot, although we do not require a special snapshot API,
|
||||||
|
since a snapshot can be represented as a timeline with no history.
|
||||||
|
|
||||||
|
This can be accomplished by simply creating a timeline and then immediately archiving it, but
|
||||||
|
that is somewhat wasteful: this timeline will spin up various tasks and open a connection to the storage
broker to try and ingest WAL, before being shut down in the subsequent archival call. To explicitly
|
||||||
|
support this common special case, we may add a parameter to the timeline creation API which
|
||||||
|
creates a timeline directly into the archived state.
|
||||||
|
|
||||||
|
Such a timeline creation will do exactly two I/Os at creation time:
|
||||||
|
- write the index_part object to record the timeline's existence
|
||||||
|
- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
|
||||||
|
write the tenant manifest.
|
||||||
|
|
||||||
|
Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
|
||||||
|
up the 'snapshot' branch and write out image layers.
|
||||||
|
|
||||||
|
## Future Work
|
||||||
|
|
||||||
|
### Enabling `fullbackup` dumps from archive branches
|
||||||
|
|
||||||
|
It would be useful to be able to export an archive branch to another system, or for use in a local
|
||||||
|
postgres database.
|
||||||
|
|
||||||
|
This could be implemented as a general capability for all branches, in which case it would "just work"
|
||||||
|
for archive branches by activating them. However, downloading all the layers in a branch just to generate
|
||||||
|
a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
|
||||||
|
which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
|
||||||
|
|
||||||
|
Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
|
||||||
|
is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup
|
||||||
|
stream to S3 in an intermediate format and then having one node stitch them together).
|
||||||
|
|
||||||
|
### Tagging layers from archived branches
|
||||||
|
|
||||||
|
When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
|
||||||
|
we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
|
||||||
|
cheaper storage.
|
||||||
|
|
||||||
|
This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
|
||||||
|
external hints on which branches are likely to be reactivated, and which branches are good candidates for
|
||||||
|
tagging for low performance storage.
|
||||||
|
|
||||||
|
Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object
|
||||||
|
stores have similar mechanisms.
|
||||||
|
|
||||||
|
### Storing sequences of archive branches as deltas
|
||||||
|
|
||||||
|
When archived branches are used as scheduled snapshots, we could store them even more efficiently
|
||||||
|
by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
|
||||||
|
storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
|
||||||
|
pages). This is the kind of encoding that many backup storage systems use.
|
||||||
|
|
||||||
|
The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
|
||||||
|
vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full
|
||||||
|
copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
|
||||||
|
so the complexity tradeoff of diff-encoding it is dubious).
|
||||||
|
|
||||||
|
One does not necessarily have to read back the previous snapshot in order to encode the next one: if the
|
||||||
|
pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
|
||||||
|
we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch,
|
||||||
|
so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
|
||||||
|
delta snapshot".
|
||||||
|
|
||||||
|
Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
|
||||||
|
each other: perhaps this would be done by making the archive branches have child/parent relationships with
|
||||||
|
each other, or perhaps we would permit them to remain children of their original parent, but additionally
|
||||||
|
have a relationship with the snapshot they're encoded relative to.
|
||||||
|
|
||||||
|
Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
|
||||||
|
out how frequently to write a full copy is important. This is essentially a zoomed-out version of what
|
||||||
|
we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
|
||||||
|
|
||||||
|
|
||||||
|
## FAQ/Alternatives
|
||||||
|
|
||||||
|
### Store all timelines in the tenant manifest
|
||||||
|
|
||||||
|
Rather than special-casing offloaded timelines in the offload manifest, we could store a total
|
||||||
|
manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
|
||||||
|
startup.
|
||||||
|
|
||||||
|
That would be a more invasive change (require hooking in to timeline creation), and would
|
||||||
|
generate much more I/O to this manifest for tenants that had many branches _and_ frequent
|
||||||
|
create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines
|
||||||
|
means that we only have to cope with the rate at which long-lived timelines are archived, rather
|
||||||
|
than the rate at which short-lived timelines are created & destroyed.
|
||||||
|
|
||||||
|
### Automatically archiving/activating timelines without external API calls
|
||||||
|
|
||||||
|
We could implement TTL driven offload of timelines, waking them up when a page request
|
||||||
|
arrives.
|
||||||
|
|
||||||
|
This has downsides:
|
||||||
|
- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
|
||||||
|
know which of their branches are in this state, and might get a surprise when they try
|
||||||
|
to use such a branch.
|
||||||
|
- Price fluctuation: if the archival of a branch is used in end user pricing, then users
|
||||||
|
prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it
|
||||||
|
is created, rather than having a usage-dependent storage price.
|
||||||
|
- Complexity: enabling the page service to call up into the Tenant to activate a timeline
|
||||||
|
would be awkward, compared with an external entry point.
|
||||||
|
|
||||||
|
### Make offloaded a state of Timeline
|
||||||
|
|
||||||
|
To reduce the operator-facing complexity of having some timeline APIs that only return
|
||||||
|
non-offloaded timelines, we could build the offloaded state into the Timeline type.
|
||||||
|
|
||||||
|
`timeline.rs` is already one of the most egregiously long source files in the tree, so
|
||||||
|
this is rejected on the basis that we need to avoid making that complexity worse.
|
||||||
@@ -13,11 +13,7 @@ use std::{
|
|||||||
|
|
||||||
use measured::{
|
use measured::{
|
||||||
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
||||||
metric::{
|
metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
|
||||||
group::{Encoding, MetricValue},
|
|
||||||
name::MetricNameEncoder,
|
|
||||||
Metric, MetricType, MetricVec,
|
|
||||||
},
|
|
||||||
text::TextEncoder,
|
text::TextEncoder,
|
||||||
LabelGroup,
|
LabelGroup,
|
||||||
};
|
};
|
||||||
@@ -144,6 +140,7 @@ impl<const N: usize> HyperLogLogState<N> {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
||||||
for HyperLogLogState<N>
|
for HyperLogLogState<N>
|
||||||
{
|
{
|
||||||
@@ -182,12 +179,13 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
|
|||||||
.into_iter()
|
.into_iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.try_for_each(|(hll_shard, val)| {
|
.try_for_each(|(hll_shard, val)| {
|
||||||
enc.write_metric_value(
|
CounterState::new(val as u64).collect_into(
|
||||||
name.by_ref(),
|
&(),
|
||||||
labels.by_ref().compose_with(HllShardLabel {
|
labels.by_ref().compose_with(HllShardLabel {
|
||||||
hll_shard: hll_shard as i64,
|
hll_shard: hll_shard as i64,
|
||||||
}),
|
}),
|
||||||
MetricValue::Int(val as i64),
|
name.by_ref(),
|
||||||
|
enc,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ use measured::{
|
|||||||
metric::{
|
metric::{
|
||||||
counter::CounterState,
|
counter::CounterState,
|
||||||
gauge::GaugeState,
|
gauge::GaugeState,
|
||||||
group::{Encoding, MetricValue},
|
group::Encoding,
|
||||||
name::{MetricName, MetricNameEncoder},
|
name::{MetricName, MetricNameEncoder},
|
||||||
MetricEncoding, MetricFamilyEncoding,
|
MetricEncoding, MetricFamilyEncoding,
|
||||||
},
|
},
|
||||||
@@ -171,8 +171,11 @@ fn write_gauge<Enc: Encoding>(
|
|||||||
labels: impl LabelGroup,
|
labels: impl LabelGroup,
|
||||||
name: impl MetricNameEncoder,
|
name: impl MetricNameEncoder,
|
||||||
enc: &mut Enc,
|
enc: &mut Enc,
|
||||||
) -> Result<(), Enc::Err> {
|
) -> Result<(), Enc::Err>
|
||||||
enc.write_metric_value(name, labels, MetricValue::Int(x))
|
where
|
||||||
|
GaugeState: MetricEncoding<Enc>,
|
||||||
|
{
|
||||||
|
GaugeState::new(x).collect_into(&(), labels, name, enc)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@@ -544,15 +547,6 @@ impl<T: Encoding> Encoding for Inc<T> {
|
|||||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||||
self.0.write_help(name, help)
|
self.0.write_help(name, help)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_metric_value(
|
|
||||||
&mut self,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
value: MetricValue,
|
|
||||||
) -> Result<(), Self::Err> {
|
|
||||||
self.0.write_metric_value(name, labels, value)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
||||||
@@ -579,15 +573,6 @@ impl<T: Encoding> Encoding for Dec<T> {
|
|||||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||||
self.0.write_help(name, help)
|
self.0.write_help(name, help)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_metric_value(
|
|
||||||
&mut self,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
value: MetricValue,
|
|
||||||
) -> Result<(), Self::Err> {
|
|
||||||
self.0.write_metric_value(name, labels, value)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Write the dec counter to the encoder
|
/// Write the dec counter to the encoder
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18;
|
|||||||
/// See [`Key::to_i128`] for more information on the encoding.
|
/// See [`Key::to_i128`] for more information on the encoding.
|
||||||
pub const METADATA_KEY_SIZE: usize = 16;
|
pub const METADATA_KEY_SIZE: usize = 16;
|
||||||
|
|
||||||
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
|
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key.
|
||||||
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
|
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
|
||||||
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
|
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,16 @@ pub struct KeySpace {
|
|||||||
pub ranges: Vec<Range<Key>>,
|
pub ranges: Vec<Range<Key>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for KeySpace {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "[")?;
|
||||||
|
for range in &self.ranges {
|
||||||
|
write!(f, "{}..{},", range.start, range.end)?;
|
||||||
|
}
|
||||||
|
write!(f, "]")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// A wrapper type for sparse keyspaces.
|
/// A wrapper type for sparse keyspaces.
|
||||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||||
pub struct SparseKeySpace(pub KeySpace);
|
pub struct SparseKeySpace(pub KeySpace);
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ use std::{
|
|||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::{BufRead, Read},
|
io::{BufRead, Read},
|
||||||
num::{NonZeroU64, NonZeroUsize},
|
num::{NonZeroU64, NonZeroUsize},
|
||||||
|
str::FromStr,
|
||||||
sync::atomic::AtomicUsize,
|
sync::atomic::AtomicUsize,
|
||||||
time::{Duration, SystemTime},
|
time::{Duration, SystemTime},
|
||||||
};
|
};
|
||||||
@@ -228,6 +229,11 @@ pub struct TimelineCreateRequest {
|
|||||||
pub pg_version: Option<u32>,
|
pub pg_version: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
|
pub struct LsnLeaseRequest {
|
||||||
|
pub lsn: Lsn,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct TenantShardSplitRequest {
|
pub struct TenantShardSplitRequest {
|
||||||
pub new_shard_count: u8,
|
pub new_shard_count: u8,
|
||||||
@@ -288,7 +294,6 @@ pub struct TenantConfig {
|
|||||||
pub walreceiver_connect_timeout: Option<String>,
|
pub walreceiver_connect_timeout: Option<String>,
|
||||||
pub lagging_wal_timeout: Option<String>,
|
pub lagging_wal_timeout: Option<String>,
|
||||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||||
pub trace_read_requests: Option<bool>,
|
|
||||||
pub eviction_policy: Option<EvictionPolicy>,
|
pub eviction_policy: Option<EvictionPolicy>,
|
||||||
pub min_resident_size_override: Option<u64>,
|
pub min_resident_size_override: Option<u64>,
|
||||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||||
@@ -432,6 +437,41 @@ pub enum CompactionAlgorithm {
|
|||||||
Tiered,
|
Tiered,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
|
pub enum ImageCompressionAlgorithm {
|
||||||
|
// Disabled for writes, support decompressing during read path
|
||||||
|
Disabled,
|
||||||
|
/// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
|
||||||
|
/// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
|
||||||
|
Zstd {
|
||||||
|
level: Option<i8>,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for ImageCompressionAlgorithm {
|
||||||
|
type Err = anyhow::Error;
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
let mut components = s.split(['(', ')']);
|
||||||
|
let first = components
|
||||||
|
.next()
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("empty string"))?;
|
||||||
|
match first {
|
||||||
|
"disabled" => Ok(ImageCompressionAlgorithm::Disabled),
|
||||||
|
"zstd" => {
|
||||||
|
let level = if let Some(v) = components.next() {
|
||||||
|
let v: i8 = v.parse()?;
|
||||||
|
Some(v)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(ImageCompressionAlgorithm::Zstd { level })
|
||||||
|
}
|
||||||
|
_ => anyhow::bail!("invalid specifier '{first}'"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct CompactionAlgorithmSettings {
|
pub struct CompactionAlgorithmSettings {
|
||||||
pub kind: CompactionAlgorithm,
|
pub kind: CompactionAlgorithm,
|
||||||
@@ -643,6 +683,16 @@ pub struct TimelineInfo {
|
|||||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||||
pub current_logical_size_non_incremental: Option<u64>,
|
pub current_logical_size_non_incremental: Option<u64>,
|
||||||
|
|
||||||
|
/// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
|
||||||
|
/// beyond the branch's branch point, we only count up to the branch point.
|
||||||
|
pub pitr_history_size: u64,
|
||||||
|
|
||||||
|
/// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
|
||||||
|
/// ancestor data used by this branch would have been retained anyway). If this is false, then
|
||||||
|
/// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
|
||||||
|
/// otherwise be able to GC.
|
||||||
|
pub within_ancestor_pitr: bool,
|
||||||
|
|
||||||
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
||||||
|
|
||||||
pub wal_source_connstr: Option<String>,
|
pub wal_source_connstr: Option<String>,
|
||||||
@@ -1614,4 +1664,25 @@ mod tests {
|
|||||||
AuxFilePolicy::CrossValidation
|
AuxFilePolicy::CrossValidation
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_image_compression_algorithm_parsing() {
|
||||||
|
use ImageCompressionAlgorithm::*;
|
||||||
|
assert_eq!(
|
||||||
|
ImageCompressionAlgorithm::from_str("disabled").unwrap(),
|
||||||
|
Disabled
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
ImageCompressionAlgorithm::from_str("zstd").unwrap(),
|
||||||
|
Zstd { level: None }
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
|
||||||
|
Zstd { level: Some(18) }
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
|
||||||
|
Zstd { level: Some(-3) }
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,59 +1,42 @@
-use std::{ops::RangeInclusive, str::FromStr};
+//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
+//!
+//! This module contains a variety of types used to represent the concept of sharding
+//! a Neon tenant across multiple physical shards. Since there are quite a few of these,
+//! we provide a summary here.
+//!
+//! Types used to describe shards:
+//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
+//!   which identifies a tenant which is not shard-aware. This means its storage paths do not include
+//!   a shard suffix.
+//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
+//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
+//!   without the tenant ID. This is useful for things that are implicitly scoped to a particular
+//!   tenant, such as layer files.
+//! - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
+//!   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
+//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
+//!   four hex digits. An unsharded tenant is `0000`.
+//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
+//!
+//! Types used to describe the parameters for data distribution in a sharded tenant:
+//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
+//!   multiple shards. Its value is given in 8kiB pages.
+//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
+//!   always zero: this is provided for future upgrades that might introduce different
+//!   data distribution schemes.
+//!
+//! Examples:
+//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
+//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
+//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
+//!   and their slugs are 0004, 0104, 0204, and 0304.

 use crate::{key::Key, models::ShardParameters};
-use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
-use utils::id::TenantId;

-/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
-///
-/// This module contains a variety of types used to represent the concept of sharding
-/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
-/// we provide a summary here.
-///
-/// Types used to describe shards:
-/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
-///   which identifies a tenant which is not shard-aware. This means its storage paths do not include
-///   a shard suffix.
-/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
-/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
-///   without the tenant ID. This is useful for things that are implicitly scoped to a particular
-///   tenant, such as layer files.
-/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
-///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
-/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
-///   four hex digits. An unsharded tenant is `0000`.
-/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
-///
-/// Types used to describe the parameters for data distribution in a sharded tenant:
-/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
-///   multiple shards. Its value is given in 8kiB pages.
-/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
-///   always zero: this is provided for future upgrades that might introduce different
-///   data distribution schemes.
-///
-/// Examples:
-/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
-/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
-/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
-///   and their slugs are 0004, 0104, 0204, and 0304.
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardNumber(pub u8);
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(u8);
-
-/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
+#[doc(inline)]
+pub use ::utils::shard::*;

 /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
 /// and to check whether that [`ShardNumber`] is the same as the current shard.
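The module docs above describe the slug encoding in prose. A tiny standalone sketch of the same rule may help make the examples concrete; the helper below is hypothetical and simply mirrors the described `ShardSlug` formatting (two hex digits of shard number followed by two hex digits of shard count), it is not the re-exported implementation itself:

```rust
// Hypothetical helper mirroring the documented slug rule.
fn shard_slug(shard_number: u8, shard_count: u8) -> String {
    format!("{:02x}{:02x}", shard_number, shard_count)
}

fn main() {
    // Legacy unsharded tenant: ShardCount(0), ShardNumber(0) -> "0000".
    assert_eq!(shard_slug(0, 0), "0000");
    // Single-sharded tenant: ShardCount(1), ShardNumber(0) -> "0001".
    assert_eq!(shard_slug(0, 1), "0001");
    // Tenant with 4 shards -> "0004", "0104", "0204", "0304".
    let slugs: Vec<String> = (0..4).map(|i| shard_slug(i, 4)).collect();
    assert_eq!(slugs, ["0004", "0104", "0204", "0304"]);
}
```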
@@ -65,362 +48,6 @@ pub struct ShardIdentity {
     layout: ShardLayout,
 }

-/// Formatting helper, for generating the `shard_id` label in traces.
-struct ShardSlug<'a>(&'a TenantShardId);
-
-/// TenantShardId globally identifies a particular shard in a particular tenant.
-///
-/// These are written as `<TenantId>-<ShardSlug>`, for example:
-///   # The second shard in a two-shard tenant
-///   072f1291a5310026820b2fe4b2968934-0102
-///
-/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
-/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
-/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
-///
-/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
-/// is both forward and backward compatible with TenantId: a legacy TenantId can be
-/// decoded as a TenantShardId, and when re-encoded it will be parseable
-/// as a TenantId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TenantShardId {
-    pub tenant_id: TenantId,
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-impl ShardCount {
-    pub const MAX: Self = Self(u8::MAX);
-
-    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
-    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as [`TenantShardId::unsharded`].
-    ///
-    /// This method returns the actual number of shards, i.e. if our internal value is
-    /// zero, we return 1 (unsharded tenants have 1 shard).
-    pub fn count(&self) -> u8 {
-        if self.0 > 0 {
-            self.0
-        } else {
-            1
-        }
-    }
-
-    /// The literal internal value: this is **not** the number of shards in the
-    /// tenant, as we have a special zero value for legacy unsharded tenants. Use
-    /// [`Self::count`] if you want to know the cardinality of shards.
-    pub fn literal(&self) -> u8 {
-        self.0
-    }
-
-    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
-    /// uses the legacy format for `TenantShardId`. See also the documentation for
-    /// [`Self::count`].
-    pub fn is_unsharded(&self) -> bool {
-        self.0 == 0
-    }
-
-    /// `v` may be zero, or the number of shards in the tenant. `v` is what
-    /// [`Self::literal`] would return.
-    pub const fn new(val: u8) -> Self {
-        Self(val)
-    }
-}
-
-impl ShardNumber {
-    pub const MAX: Self = Self(u8::MAX);
-}
-
-impl TenantShardId {
-    pub fn unsharded(tenant_id: TenantId) -> Self {
-        Self {
-            tenant_id,
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
-    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
-    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
-        RangeInclusive::new(
-            Self {
-                tenant_id,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            },
-            Self {
-                tenant_id,
-                shard_number: ShardNumber::MAX,
-                shard_count: ShardCount::MAX,
-            },
-        )
-    }
-
-    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
-        ShardSlug(self)
-    }
-
-    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_shard_zero(&self) -> bool {
-        self.shard_number == ShardNumber(0)
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
-    }
-
-    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
-    /// is useful when logging from code that is already in a span that includes tenant ID, to
-    /// keep messages reasonably terse.
-    pub fn to_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_number: self.shard_number,
-            shard_count: self.shard_count,
-        }
-    }
-
-    /// Calculate the children of this TenantShardId when splitting the overall tenant into
-    /// the given number of shards.
-    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
-        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
-        let mut child_shards = Vec::new();
-        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
-            // Key mapping is based on a round robin mapping of key hash modulo shard count,
-            // so our child shards are the ones which the same keys would map to.
-            if shard_number % effective_old_shard_count == self.shard_number.0 {
-                child_shards.push(TenantShardId {
-                    tenant_id: self.tenant_id,
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: new_shard_count,
-                })
-            }
-        }
-
-        child_shards
-    }
-}
-
-impl<'a> std::fmt::Display for ShardSlug<'a> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{:02x}{:02x}",
-            self.0.shard_number.0, self.0.shard_count.0
-        )
-    }
-}
-
-impl std::fmt::Display for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.shard_count != ShardCount(0) {
-            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
-        } else {
-            // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
-            // is distinct from the normal single shard case (shard count == 1).
-            self.tenant_id.fmt(f)
-        }
-    }
-}
-
-impl std::fmt::Debug for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for TenantShardId {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
-        if s.len() == 32 {
-            // Legacy case: no shard specified
-            Ok(Self {
-                tenant_id: TenantId::from_str(s)?,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            })
-        } else if s.len() == 37 {
-            let bytes = s.as_bytes();
-            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
-            Ok(Self {
-                tenant_id,
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 18]> for TenantShardId {
-    fn from(b: [u8; 18]) -> Self {
-        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
-
-        Self {
-            tenant_id: TenantId::from(tenant_id_bytes),
-            shard_number: ShardNumber(b[16]),
-            shard_count: ShardCount(b[17]),
-        }
-    }
-}
-
-impl ShardIndex {
-    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            shard_number: number,
-            shard_count: count,
-        }
-    }
-    pub fn unsharded() -> Self {
-        Self {
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
-    }
-
-    /// For use in constructing remote storage paths: concatenate this with a TenantId
-    /// to get a fully qualified TenantShardId.
-    ///
-    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
-    /// that the legacy pre-sharding remote key format is preserved.
-    pub fn get_suffix(&self) -> String {
-        if self.is_unsharded() {
-            "".to_string()
-        } else {
-            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-        }
-    }
-}
-
-impl std::fmt::Display for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-    }
-}
-
-impl std::fmt::Debug for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for ShardIndex {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 1 byte shard number, 1 byte shard count
-        if s.len() == 4 {
-            let bytes = s.as_bytes();
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(bytes, &mut shard_parts)?;
-            Ok(Self {
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 2]> for ShardIndex {
-    fn from(b: [u8; 2]) -> Self {
-        Self {
-            shard_number: ShardNumber(b[0]),
-            shard_count: ShardCount(b[1]),
-        }
-    }
-}
-
-impl Serialize for TenantShardId {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Note: while human encoding of [`TenantShardId`] is backward and forward
-            // compatible, this binary encoding is not.
-            let mut packed: [u8; 18] = [0; 18];
-            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
-            packed[16] = self.shard_number.0;
-            packed[17] = self.shard_count.0;
-
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for TenantShardId {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = TenantShardId;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 18])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 18] = Deserialize::deserialize(s)?;
-                Ok(TenantShardId::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                TenantShardId::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                18,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 /// Stripe size in number of pages
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
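The removed `TenantShardId::split` above selects child shards by congruence modulo the old shard count. A small standalone sketch of that selection rule (the helper and its integer-only signature are invented for illustration; the real method returns full `TenantShardId` values):

```rust
// Sketch of the child-shard selection rule: when going from `old_count` to
// `new_count` shards, shard `i` keeps the children congruent to `i` modulo
// the old count. An unsharded tenant (count 0) behaves like a count of 1.
fn split_children(shard_number: u8, old_count: u8, new_count: u8) -> Vec<u8> {
    let effective_old = std::cmp::max(old_count, 1);
    (0..new_count)
        .filter(|child| child % effective_old == shard_number)
        .collect()
}

fn main() {
    // Splitting an unsharded tenant into 4 shards: shard 0 owns all children.
    assert_eq!(split_children(0, 0, 4), vec![0, 1, 2, 3]);
    // Splitting a 2-shard tenant into 8: shard 1's keys land on children 1, 3, 5, 7.
    assert_eq!(split_children(1, 2, 8), vec![1, 3, 5, 7]);
}
```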
@@ -585,77 +212,6 @@ impl ShardIdentity {
     }
 }

-impl Serialize for ShardIndex {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Binary encoding is not used in index_part.json, but is included in anticipation of
-            // switching various structures (e.g. inter-process communication, remote metadata) to more
-            // compact binary encodings in future.
-            let mut packed: [u8; 2] = [0; 2];
-            packed[0] = self.shard_number.0;
-            packed[1] = self.shard_count.0;
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for ShardIndex {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = ShardIndex;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 2])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 2] = Deserialize::deserialize(s)?;
-                Ok(ShardIndex::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                ShardIndex::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                2,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
 /// in order to be able to serve basebackup requests without peer communication).
 fn key_is_shard0(key: &Key) -> bool {
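The removed serde impls give `ShardIndex` a dual encoding: hex text for human-readable formats, a packed `[u8; 2]` otherwise. A usage sketch of that behavior, assuming the impls move along with the type to `utils::shard` and that `serde_json` is available as a dev-dependency (both are assumptions here):

```rust
use utils::shard::{ShardCount, ShardIndex, ShardNumber};

fn main() {
    let idx = ShardIndex::new(ShardNumber(1), ShardCount(2));

    // JSON is a human-readable serializer, so the four-hex-digit form is used.
    let json = serde_json::to_string(&idx).unwrap();
    assert_eq!(json, "\"0102\"");

    // Round-trips back through FromStr via the deserializer's visit_str path.
    let back: ShardIndex = serde_json::from_str(&json).unwrap();
    assert_eq!(back, idx);
}
```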
@@ -737,7 +293,9 @@ pub fn describe(

 #[cfg(test)]
 mod tests {
-    use utils::Hex;
+    use std::str::FromStr;
+
+    use utils::{id::TenantId, Hex};

     use super::*;
@@ -13,6 +13,7 @@ rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tokio-rustls.workspace = true
 tracing.workspace = true

@@ -23,4 +24,4 @@ workspace_hack.workspace = true
 once_cell.workspace = true
 rustls-pemfile.workspace = true
 tokio-postgres.workspace = true
 tokio-postgres-rustls.workspace = true
@@ -16,6 +16,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
@@ -400,21 +401,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
     }

     /// Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run<F, S>(
+    pub async fn run(
         mut self,
         handler: &mut impl Handler<IO>,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S + Clone,
-        S: Future,
-    {
-        let ret = self
-            .run_message_loop(handler, shutdown_watcher.clone())
-            .await;
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
+        let ret = self.run_message_loop(handler, cancel).await;

         tokio::select! {
-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                 // do nothing; we most likely got already stopped by shutdown and will log it next.
             }
             _ = self.framed.shutdown() => {
@@ -444,21 +439,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         }
     }

-    async fn run_message_loop<F, S>(
+    async fn run_message_loop(
         &mut self,
         handler: &mut impl Handler<IO>,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S,
-        S: Future,
-    {
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
         trace!("postgres backend to {:?} started", self.peer_addr);

         tokio::select!(
             biased;

-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                 // We were requested to shut down.
                 tracing::info!("shutdown request received during handshake");
                 return Err(QueryError::Shutdown)
@@ -473,7 +464,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         let mut query_string = Bytes::new();
         while let Some(msg) = tokio::select!(
             biased;
-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                 // We were requested to shut down.
                 tracing::info!("shutdown request received in run_message_loop");
                 return Err(QueryError::Shutdown)
@@ -485,7 +476,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
             let result = self.process_message(handler, msg, &mut query_string).await;
             tokio::select!(
                 biased;
-                _ = shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     // We were requested to shut down.
                     tracing::info!("shutdown request received during response flush");
@@ -672,11 +663,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         assert!(self.state < ProtoState::Authentication);
         let have_tls = self.tls_config.is_some();
         match msg {
-            FeStartupPacket::SslRequest => {
+            FeStartupPacket::SslRequest { direct } => {
                 debug!("SSL requested");

-                self.write_message(&BeMessage::EncryptionResponse(have_tls))
-                    .await?;
+                if !direct {
+                    self.write_message(&BeMessage::EncryptionResponse(have_tls))
+                        .await?;
+                } else if !have_tls {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "direct SSL negotiation but no TLS support"
+                    )));
+                }

                 if have_tls {
                     self.start_tls().await?;
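With this change, callers no longer hand `run()` a `shutdown_watcher` closure; they share a `CancellationToken` and cancel it to stop the loop. A rough calling-convention sketch (the wrapper function, the ctrl-c wiring, and the tokio `signal` feature are assumptions made for the example, not code from this PR):

```rust
use tokio_util::sync::CancellationToken;

async fn serve<IO>(
    pgbackend: postgres_backend::PostgresBackend<IO>,
    handler: &mut impl postgres_backend::Handler<IO>,
) -> Result<(), postgres_backend::QueryError>
where
    IO: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
{
    let cancel = CancellationToken::new();

    // Another task holds a clone of the token and cancels it on shutdown.
    let cancel_on_shutdown = cancel.clone();
    tokio::spawn(async move {
        tokio::signal::ctrl_c().await.ok();
        cancel_on_shutdown.cancel();
    });

    // run() now takes the token by reference instead of a closure returning a future.
    pgbackend.run(handler, &cancel).await
}
```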
@@ -3,13 +3,14 @@ use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
 use std::io::Cursor;
-use std::{future, sync::Arc};
+use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::{TcpListener, TcpStream};
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::MakeTlsConnect;
 use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
 use tokio_postgres_rustls::MakeRustlsConnect;
+use tokio_util::sync::CancellationToken;

 // generate client, server test streams
 async fn make_tcp_pair() -> (TcpStream, TcpStream) {
@@ -50,7 +51,7 @@ async fn simple_select() {

     tokio::spawn(async move {
         let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, future::pending::<()>).await
+        pgbackend.run(&mut handler, &CancellationToken::new()).await
     });

     let conf = Config::new();
@@ -102,7 +103,7 @@ async fn simple_select_ssl() {

     tokio::spawn(async move {
         let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, future::pending::<()>).await
+        pgbackend.run(&mut handler, &CancellationToken::new()).await
     });

     let client_cfg = rustls::ClientConfig::builder()
@@ -44,9 +44,9 @@ impl ConnectionError {
 /// Wraps async io `stream`, providing messages to write/flush + read Postgres
 /// messages.
 pub struct Framed<S> {
-    stream: S,
-    read_buf: BytesMut,
-    write_buf: BytesMut,
+    pub stream: S,
+    pub read_buf: BytesMut,
+    pub write_buf: BytesMut,
 }

 impl<S> Framed<S> {
@@ -39,14 +39,39 @@ pub enum FeMessage {
     PasswordMessage(Bytes),
 }

+#[derive(Clone, Copy, PartialEq, PartialOrd)]
+pub struct ProtocolVersion(u32);
+
+impl ProtocolVersion {
+    pub const fn new(major: u16, minor: u16) -> Self {
+        Self((major as u32) << 16 | minor as u32)
+    }
+    pub const fn minor(self) -> u16 {
+        self.0 as u16
+    }
+    pub const fn major(self) -> u16 {
+        (self.0 >> 16) as u16
+    }
+}
+
+impl fmt::Debug for ProtocolVersion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_list()
+            .entry(&self.major())
+            .entry(&self.minor())
+            .finish()
+    }
+}
+
 #[derive(Debug)]
 pub enum FeStartupPacket {
     CancelRequest(CancelKeyData),
-    SslRequest,
+    SslRequest {
+        direct: bool,
+    },
     GssEncRequest,
     StartupMessage {
-        major_version: u32,
-        minor_version: u32,
+        version: ProtocolVersion,
         params: StartupMessageParams,
     },
 }
@@ -301,11 +326,23 @@ impl FeStartupPacket {
     /// different from [`FeMessage::parse`] because startup messages don't have
     /// message type byte; otherwise, its comments apply.
     pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
+        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
         const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
-        const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
-        const CANCEL_REQUEST_CODE: u32 = 5678;
-        const NEGOTIATE_SSL_CODE: u32 = 5679;
-        const NEGOTIATE_GSS_CODE: u32 = 5680;
+        const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
+        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
+        const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
+        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
+        const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
+        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
+        const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
+
+        // <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
+        // First byte indicates standard SSL handshake message
+        // (It can't be a Postgres startup length because in network byte order
+        // that would be a startup packet hundreds of megabytes long)
+        if buf.first() == Some(&0x16) {
+            return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
+        }

         // need at least 4 bytes with packet len
         if buf.len() < 4 {
@@ -338,12 +375,10 @@ impl FeStartupPacket {
         let mut msg = buf.split_to(len).freeze();
         msg.advance(4); // consume len

-        let request_code = msg.get_u32();
-        let req_hi = request_code >> 16;
-        let req_lo = request_code & ((1 << 16) - 1);
+        let request_code = ProtocolVersion(msg.get_u32());
         // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
-        let message = match (req_hi, req_lo) {
-            (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
+        let message = match request_code {
+            CANCEL_REQUEST_CODE => {
                 if msg.remaining() != 8 {
                     return Err(ProtocolError::BadMessage(
                         "CancelRequest message is malformed, backend PID / secret key missing"
@@ -355,21 +390,22 @@ impl FeStartupPacket {
                     cancel_key: msg.get_i32(),
                 })
             }
-            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
+            NEGOTIATE_SSL_CODE => {
                 // Requested upgrade to SSL (aka TLS)
-                FeStartupPacket::SslRequest
+                FeStartupPacket::SslRequest { direct: false }
             }
-            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
+            NEGOTIATE_GSS_CODE => {
                 // Requested upgrade to GSSAPI
                 FeStartupPacket::GssEncRequest
             }
-            (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
+            version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
                 return Err(ProtocolError::Protocol(format!(
-                    "Unrecognized request code {unrecognized_code}"
+                    "Unrecognized request code {}",
+                    version.minor()
                 )));
             }
             // TODO bail if protocol major_version is not 3?
-            (major_version, minor_version) => {
+            version => {
                 // StartupMessage

                 let s = str::from_utf8(&msg).map_err(|_e| {
@@ -382,8 +418,7 @@ impl FeStartupPacket {
                 })?;

                 FeStartupPacket::StartupMessage {
-                    major_version,
-                    minor_version,
+                    version,
                     params: StartupMessageParams {
                         params: msg.slice_ref(s.as_bytes()),
                     },
@@ -522,6 +557,10 @@ pub enum BeMessage<'a> {
     RowDescription(&'a [RowDescriptor<'a>]),
     XLogData(XLogDataBody<'a>),
     NoticeResponse(&'a str),
+    NegotiateProtocolVersion {
+        version: ProtocolVersion,
+        options: &'a [&'a str],
+    },
     KeepAlive(WalSndKeepAlive),
 }
@@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> {
                     buf.put_u8(u8::from(req.request_reply));
                 });
             }
+
+            BeMessage::NegotiateProtocolVersion { version, options } => {
+                buf.put_u8(b'v');
+                write_body(buf, |buf| {
+                    buf.put_u32(version.0);
+                    buf.put_u32(options.len() as u32);
+                    for option in options.iter() {
+                        write_cstr(option, buf)?;
+                    }
+                    Ok(())
+                })?
+            }
         }
         Ok(())
     }
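`ProtocolVersion::new` packs "major.minor" into a single `u32`, so the legacy PostgreSQL magic request codes come out unchanged on the wire. A quick standalone check of the arithmetic (the struct below is a local copy just for the assertions, not the one defined in the hunk):

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
struct ProtocolVersion(u32);

impl ProtocolVersion {
    const fn new(major: u16, minor: u16) -> Self {
        Self((major as u32) << 16 | minor as u32)
    }
}

fn main() {
    // 1234 * 65536 + 5679 = 80877103, the classic SSLRequest code.
    assert_eq!(ProtocolVersion::new(1234, 5679).0, 80877103);
    // CancelRequest and GSSENCRequest sit right next to it.
    assert_eq!(ProtocolVersion::new(1234, 5678).0, 80877102);
    assert_eq!(ProtocolVersion::new(1234, 5680).0, 80877104);
    // The normal protocol version 3.0 is simply 3 << 16.
    assert_eq!(ProtocolVersion::new(3, 0).0, 196608);
}
```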
@@ -1,6 +1,5 @@
 use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration};

-use anyhow::bail;
 use aws_sdk_s3::types::StorageClass;
 use camino::Utf8PathBuf;
@@ -176,20 +175,8 @@ fn serialize_storage_class<S: serde::Serializer>(
 impl RemoteStorageConfig {
     pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);

-    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
-        let document: toml_edit::Document = match toml {
-            toml_edit::Item::Table(toml) => toml.clone().into(),
-            toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
-                toml.clone().into_table().into()
-            }
-            _ => bail!("toml not a table or inline table"),
-        };
-
-        if document.is_empty() {
-            return Ok(None);
-        }
-
-        Ok(Some(toml_edit::de::from_document(document)?))
+    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
+        Ok(utils::toml_edit_ext::deserialize_item(toml)?)
     }
 }
@@ -197,7 +184,7 @@ impl RemoteStorageConfig {
 mod tests {
     use super::*;

-    fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
+    fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
         let toml = input.parse::<toml_edit::Document>().unwrap();
         RemoteStorageConfig::from_toml(toml.as_item())
     }
@@ -207,7 +194,7 @@ mod tests {
         let input = "local_path = '.'
timeout = '5s'";

-        let config = parse(input).unwrap().expect("it exists");
+        let config = parse(input).unwrap();

         assert_eq!(
             config,
@@ -229,7 +216,7 @@ timeout = '5s'";
timeout = '7s'
";

-        let config = parse(toml).unwrap().expect("it exists");
+        let config = parse(toml).unwrap();

         assert_eq!(
             config,
@@ -257,7 +244,7 @@ timeout = '5s'";
timeout = '7s'
";

-        let config = parse(toml).unwrap().expect("it exists");
+        let config = parse(toml).unwrap();

         assert_eq!(
             config,
@@ -34,10 +34,10 @@ struct SegmentSize {
 }

 struct SizeAlternatives {
-    // cheapest alternative if parent is available.
+    /// cheapest alternative if parent is available.
     incremental: SegmentSize,

-    // cheapest alternative if parent node is not available
+    /// cheapest alternative if parent node is not available
     non_incremental: Option<SegmentSize>,
 }
@@ -3,10 +3,17 @@ use std::fmt::Write;

 const SVG_WIDTH: f32 = 500.0;

+/// Different branch kind for SVG drawing.
+#[derive(PartialEq)]
+pub enum SvgBranchKind {
+    Timeline,
+    Lease,
+}
+
 struct SvgDraw<'a> {
     storage: &'a StorageModel,
     branches: &'a [String],
-    seg_to_branch: &'a [usize],
+    seg_to_branch: &'a [(usize, SvgBranchKind)],
     sizes: &'a [SegmentSizeResult],

     // layout
@@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> {
         "<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
     )?;
     writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
+    writeln!(
+        result,
+        "<line x1=\"10\" y1=\"85\" x2=\"10\" y2=\"95\" stroke-width=\"3\" stroke=\"blue\" />"
+    )?;
+    writeln!(result, "<text x=\"20\" y=\"95\">LSN lease</text>")?;
     Ok(())
 }

 pub fn draw_svg(
     storage: &StorageModel,
     branches: &[String],
-    seg_to_branch: &[usize],
+    seg_to_branch: &[(usize, SvgBranchKind)],
     sizes: &SizeResult,
 ) -> anyhow::Result<String> {
     let mut draw = SvgDraw {
@@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> {

         // Layout the timelines on Y dimension.
         // TODO
-        let mut y = 100.0;
+        let mut y = 120.0;
         let mut branch_y_coordinates = Vec::new();
         for _branch in self.branches {
             branch_y_coordinates.push(y);
@@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> {

         // Calculate coordinates for each point
         let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
-            .map(|(seg, branch_id)| {
+            .map(|(seg, (branch_id, _))| {
                 let x = (seg.lsn - min_lsn) as f32 / xscale;
                 let y = branch_y_coordinates[*branch_id];
                 (x, y)
@@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> {

         // draw a snapshot point if it's needed
         let (coord_x, coord_y) = self.seg_coordinates[seg_id];
+
+        let (_, kind) = &self.seg_to_branch[seg_id];
+        if kind == &SvgBranchKind::Lease {
+            let (x1, y1) = (coord_x, coord_y - 10.0);
+            let (x2, y2) = (coord_x, coord_y + 10.0);
+
+            let style = "stroke-width=\"3\" stroke=\"blue\"";
+
+            writeln!(
+                result,
+                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
+            )?;
+            writeln!(result, "  <title>leased lsn at {}</title>", seg.lsn)?;
+            writeln!(result, "</line>")?;
+        }
+
         if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
             writeln!(
                 result,
@@ -40,6 +40,7 @@ thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
+toml_edit.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
114  libs/utils/src/circuit_breaker.rs  Normal file
@@ -0,0 +1,114 @@
use std::{
    fmt::Display,
    time::{Duration, Instant},
};

use metrics::IntCounter;

/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
/// to mitigate the log spam from repeated failures.
pub struct CircuitBreaker {
    /// An identifier that enables us to log useful errors when a circuit is broken
    name: String,

    /// Consecutive failures since last success
    fail_count: usize,

    /// How many consecutive failures before we break the circuit
    fail_threshold: usize,

    /// If circuit is broken, when was it broken?
    broken_at: Option<Instant>,

    /// If set, we will auto-reset the circuit this long after it was broken. If None, broken
    /// circuits stay broken forever, or until success() is called.
    reset_period: Option<Duration>,

    /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker
    /// to permit something to keep running even if it would otherwise have tripped it.
    short_circuit: bool,
}

impl CircuitBreaker {
    pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
        Self {
            name,
            fail_count: 0,
            fail_threshold,
            broken_at: None,
            reset_period,
            short_circuit: false,
        }
    }

    /// Construct an unbreakable circuit breaker, for use in unit tests etc.
    pub fn short_circuit() -> Self {
        Self {
            name: String::new(),
            fail_threshold: 0,
            fail_count: 0,
            broken_at: None,
            reset_period: None,
            short_circuit: true,
        }
    }

    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        if self.short_circuit {
            return;
        }

        self.fail_count += 1;
        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
            self.break_circuit(metric, error);
        }
    }

    /// Call this after successfully executing an operation
    pub fn success(&mut self, metric: &IntCounter) {
        self.fail_count = 0;
        if let Some(broken_at) = &self.broken_at {
            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
                humantime::format_duration(broken_at.elapsed()));
            self.broken_at = None;
            metric.inc();
        }
    }

    /// Call this before attempting an operation, and skip the operation if we are currently broken.
    pub fn is_broken(&mut self) -> bool {
        if self.short_circuit {
            return false;
        }

        if let Some(broken_at) = self.broken_at {
            match self.reset_period {
                Some(reset_period) if broken_at.elapsed() > reset_period => {
                    self.reset_circuit();
                    false
                }
                _ => true,
            }
        } else {
            false
        }
    }

    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        self.broken_at = Some(Instant::now());
        tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}");
        metric.inc();
    }

    fn reset_circuit(&mut self) {
        self.broken_at = None;
        self.fail_count = 0;
    }
}
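A usage sketch for the new `CircuitBreaker`. The counters, the breaker name, and the fake fallible step are invented for the example; real callers would pass properly registered pageserver metrics, and whether `IntCounter::new` is re-exported by the workspace `metrics` crate is an assumption here:

```rust
use std::time::Duration;

use metrics::IntCounter;
use utils::circuit_breaker::CircuitBreaker;

fn expensive_fallible_step(attempt: u32) -> Result<(), String> {
    // Purely illustrative: fail on even attempts.
    if attempt % 2 == 0 {
        Err(format!("transient failure on attempt {attempt}"))
    } else {
        Ok(())
    }
}

fn main() {
    // Unregistered counters, just for the example.
    let broken = IntCounter::new("example_broken", "breaker tripped").unwrap();
    let unbroken = IntCounter::new("example_unbroken", "breaker reset").unwrap();

    // Break after 5 consecutive failures; auto-reset 10 minutes later.
    let mut breaker = CircuitBreaker::new("compaction".to_string(), 5, Some(Duration::from_secs(600)));

    for attempt in 0..10 {
        if breaker.is_broken() {
            // Skip the expensive operation while the circuit is open.
            continue;
        }
        match expensive_fallible_step(attempt) {
            Ok(()) => breaker.success(&unbroken),
            Err(e) => breaker.fail(&broken, e),
        }
    }
}
```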
@@ -52,17 +52,17 @@ struct RequestId(String);
 /// There could be other ways to implement similar functionality:
 ///
 /// * procmacros placed on top of all handler methods
 ///   With all the drawbacks of procmacros, brings no difference implementation-wise,
 ///   and little code reduction compared to the existing approach.
 ///
 /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
 ///   implemented for [`RouterBuilder`].
 ///   Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
 ///
 /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
 ///   later, in a post-response middleware.
 ///   Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
 ///   tries to achieve with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
 pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
@@ -74,6 +74,15 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
         .transpose()
 }

+pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
+    request: &Request<Body>,
+    param_name: &str,
+) -> Result<T, ApiError> {
+    parse_query_param(request, param_name)?.ok_or_else(|| {
+        ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
+    })
+}
+
 pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
     match request.body_mut().data().await {
         Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
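A sketch of how a handler might use the new helper. The endpoint, the `timeline_id` parameter, and the exact module paths (`utils::http::request`, `utils::http::error`) are assumptions for illustration and may not match the crate layout exactly:

```rust
use hyper::{Body, Request, Response, StatusCode};
use utils::http::error::ApiError;
use utils::http::request::must_parse_query_param;
use utils::id::TimelineId;

async fn get_timeline_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    // Unlike parse_query_param, this returns ApiError::BadRequest if
    // `timeline_id` is missing, not just if it fails to parse.
    let timeline_id: TimelineId = must_parse_query_param(&request, "timeline_id")?;

    Ok(Response::builder()
        .status(StatusCode::OK)
        .body(Body::from(format!("timeline {timeline_id}\n")))
        .unwrap())
}
```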
@@ -302,17 +302,6 @@ pub struct TenantId(Id);

 id_newtype!(TenantId);

-/// Neon Connection Id identifies long-lived connections (for example a pagestream
-/// connection with the page_service). Is used for better logging and tracing
-///
-/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
-/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-/// See [`Id`] for alternative ways to serialize it.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
-pub struct ConnectionId(Id);
-
-id_newtype!(ConnectionId);
-
 // A pair uniquely identifying Neon instance.
 #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct TenantTimelineId {
@@ -26,6 +26,8 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;

+pub mod shard;
+
 mod hex;
 pub use hex::Hex;

@@ -94,6 +96,10 @@ pub mod env;

 pub mod poison;

+pub mod toml_edit_ext;
+
+pub mod circuit_breaker;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
451
libs/utils/src/shard.rs
Normal file
451
libs/utils/src/shard.rs
Normal file
@@ -0,0 +1,451 @@
//! See `pageserver_api::shard` for description on sharding.

use std::{ops::RangeInclusive, str::FromStr};

use hex::FromHex;
use serde::{Deserialize, Serialize};

use crate::id::TenantId;

#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);

#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(pub u8);

/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
}

/// Formatting helper, for generating the `shard_id` label in traces.
pub struct ShardSlug<'a>(&'a TenantShardId);

/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
///   # The second shard in a two-shard tenant
///   072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
    pub tenant_id: TenantId,
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
}

impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
    /// as [`TenantShardId::unsharded`].
    ///
    /// This method returns the actual number of shards, i.e. if our internal value is
    /// zero, we return 1 (unsharded tenants have 1 shard).
    pub fn count(&self) -> u8 {
        if self.0 > 0 {
            self.0
        } else {
            1
        }
    }

    /// The literal internal value: this is **not** the number of shards in the
    /// tenant, as we have a special zero value for legacy unsharded tenants. Use
    /// [`Self::count`] if you want to know the cardinality of shards.
    pub fn literal(&self) -> u8 {
        self.0
    }

    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
    /// uses the legacy format for `TenantShardId`. See also the documentation for
    /// [`Self::count`].
    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }

    /// `v` may be zero, or the number of shards in the tenant. `v` is what
    /// [`Self::literal`] would return.
    pub const fn new(val: u8) -> Self {
        Self(val)
    }
}

impl ShardNumber {
    pub const MAX: Self = Self(u8::MAX);
}
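The zero sentinel in ShardCount is the part that is easiest to trip over, so here is a minimal sketch of how count(), literal() and is_unsharded() relate. It assumes the module is reachable as `utils::shard`, as the `pub mod shard;` addition in lib.rs above suggests.

use utils::shard::ShardCount;

fn main() {
    let legacy = ShardCount::new(0); // the "unsharded" sentinel
    let sharded = ShardCount::new(4);

    assert!(legacy.is_unsharded());
    assert_eq!(legacy.count(), 1);   // still one physical shard
    assert_eq!(legacy.literal(), 0); // but the stored value stays zero

    assert!(!sharded.is_unsharded());
    assert_eq!(sharded.count(), 4);
    assert_eq!(sharded.literal(), 4);
}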
impl TenantShardId {
    pub fn unsharded(tenant_id: TenantId) -> Self {
        Self {
            tenant_id,
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }

    /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
        RangeInclusive::new(
            Self {
                tenant_id,
                shard_number: ShardNumber(0),
                shard_count: ShardCount(0),
            },
            Self {
                tenant_id,
                shard_number: ShardNumber::MAX,
                shard_count: ShardCount::MAX,
            },
        )
    }

    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
        ShardSlug(self)
    }

    /// Convenience for code that has special behavior on the 0th shard.
    pub fn is_shard_zero(&self) -> bool {
        self.shard_number == ShardNumber(0)
    }

    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
    }

    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
    /// is useful when logging from code that is already in a span that includes tenant ID, to
    /// keep messages reasonably terse.
    pub fn to_index(&self) -> ShardIndex {
        ShardIndex {
            shard_number: self.shard_number,
            shard_count: self.shard_count,
        }
    }

    /// Calculate the children of this TenantShardId when splitting the overall tenant into
    /// the given number of shards.
    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
        let mut child_shards = Vec::new();
        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
            // Key mapping is based on a round robin mapping of key hash modulo shard count,
            // so our child shards are the ones which the same keys would map to.
            if shard_number % effective_old_shard_count == self.shard_number.0 {
                child_shards.push(TenantShardId {
                    tenant_id: self.tenant_id,
                    shard_number: ShardNumber(shard_number),
                    shard_count: new_shard_count,
                })
            }
        }

        child_shards
    }
}
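A short sketch of how split() and tenant_range() behave, following the round-robin rule in the loop above. The tenant id literal is purely illustrative.

use std::str::FromStr;
use utils::id::TenantId;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};

fn main() {
    // Hypothetical tenant id, used only for illustration.
    let tenant_id = TenantId::from_str("072f1291a5310026820b2fe4b2968934").unwrap();

    // An unsharded parent owns every key, so it splits into all four children.
    let parent = TenantShardId::unsharded(tenant_id);
    assert_eq!(parent.split(ShardCount::new(4)).len(), 4);

    // Shard 1 of a 2-shard tenant keeps only the children whose numbers are
    // congruent to 1 modulo the old shard count: shards 1 and 3 of a 4-way split.
    let parent = TenantShardId {
        tenant_id,
        shard_number: ShardNumber(1),
        shard_count: ShardCount::new(2),
    };
    let numbers: Vec<u8> = parent
        .split(ShardCount::new(4))
        .iter()
        .map(|c| c.shard_number.0)
        .collect();
    assert_eq!(numbers, vec![1, 3]);

    // tenant_range() brackets every shard of the tenant, e.g. for BTreeMap range queries.
    assert!(TenantShardId::tenant_range(tenant_id).contains(&parent));
}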
impl<'a> std::fmt::Display for ShardSlug<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{:02x}{:02x}",
            self.0.shard_number.0, self.0.shard_count.0
        )
    }
}

impl std::fmt::Display for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.shard_count != ShardCount(0) {
            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
        } else {
            // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
            // is distinct from the normal single shard case (shard count == 1).
            self.tenant_id.fmt(f)
        }
    }
}

impl std::fmt::Debug for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug is the same as Display: the compact hex representation
        write!(f, "{}", self)
    }
}

impl std::str::FromStr for TenantShardId {
    type Err = hex::FromHexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
        if s.len() == 32 {
            // Legacy case: no shard specified
            Ok(Self {
                tenant_id: TenantId::from_str(s)?,
                shard_number: ShardNumber(0),
                shard_count: ShardCount(0),
            })
        } else if s.len() == 37 {
            let bytes = s.as_bytes();
            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
            let mut shard_parts: [u8; 2] = [0u8; 2];
            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
            Ok(Self {
                tenant_id,
                shard_number: ShardNumber(shard_parts[0]),
                shard_count: ShardCount(shard_parts[1]),
            })
        } else {
            Err(hex::FromHexError::InvalidStringLength)
        }
    }
}

impl From<[u8; 18]> for TenantShardId {
    fn from(b: [u8; 18]) -> Self {
        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();

        Self {
            tenant_id: TenantId::from(tenant_id_bytes),
            shard_number: ShardNumber(b[16]),
            shard_count: ShardCount(b[17]),
        }
    }
}
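Round-tripping the two string encodings described in the TenantShardId doc comment; the id literals are again only illustrative.

use std::str::FromStr;
use utils::shard::TenantShardId;

fn main() {
    // Sharded form: <tenant id>-<shard number><shard count>, both bytes in two-digit hex.
    let sharded = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0102").unwrap();
    assert_eq!(sharded.shard_number.0, 1);
    assert_eq!(sharded.shard_count.0, 2);
    assert_eq!(sharded.to_string(), "072f1291a5310026820b2fe4b2968934-0102");

    // Legacy 32-character form round-trips as an unsharded id, i.e. it prints
    // exactly like a plain TenantId.
    let legacy = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934").unwrap();
    assert!(legacy.is_unsharded());
    assert_eq!(legacy.to_string(), "072f1291a5310026820b2fe4b2968934");
}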
impl ShardIndex {
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
            shard_number: number,
            shard_count: count,
        }
    }
    pub fn unsharded() -> Self {
        Self {
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }

    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }

    /// For use in constructing remote storage paths: concatenate this with a TenantId
    /// to get a fully qualified TenantShardId.
    ///
    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
    /// that the legacy pre-sharding remote key format is preserved.
    pub fn get_suffix(&self) -> String {
        if self.is_unsharded() {
            "".to_string()
        } else {
            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
        }
    }
}

impl std::fmt::Display for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
    }
}

impl std::fmt::Debug for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug is the same as Display: the compact hex representation
        write!(f, "{}", self)
    }
}

impl std::str::FromStr for ShardIndex {
    type Err = hex::FromHexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Expect format: 1 byte shard number, 1 byte shard count
        if s.len() == 4 {
            let bytes = s.as_bytes();
            let mut shard_parts: [u8; 2] = [0u8; 2];
            hex::decode_to_slice(bytes, &mut shard_parts)?;
            Ok(Self {
                shard_number: ShardNumber(shard_parts[0]),
                shard_count: ShardCount(shard_parts[1]),
            })
        } else {
            Err(hex::FromHexError::InvalidStringLength)
        }
    }
}

impl From<[u8; 2]> for ShardIndex {
    fn from(b: [u8; 2]) -> Self {
        Self {
            shard_number: ShardNumber(b[0]),
            shard_count: ShardCount(b[1]),
        }
    }
}
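How the ShardIndex formatting helpers combine, again as an illustrative sketch under the same `utils::shard` path assumption.

use utils::shard::{ShardCount, ShardIndex, ShardNumber};

fn main() {
    // Unsharded indices keep the legacy remote path layout: no suffix at all.
    assert_eq!(ShardIndex::unsharded().get_suffix(), "");

    // Sharded indices append "-<number><count>" in two-digit hex,
    // while Display alone prints just the four hex digits.
    let idx = ShardIndex::new(ShardNumber(1), ShardCount::new(2));
    assert_eq!(idx.get_suffix(), "-0102");
    assert_eq!(idx.to_string(), "0102");
}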
impl Serialize for TenantShardId {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Note: while human encoding of [`TenantShardId`] is backward and forward
            // compatible, this binary encoding is not.
            let mut packed: [u8; 18] = [0; 18];
            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
            packed[16] = self.shard_number.0;
            packed[17] = self.shard_count.0;

            packed.serialize(serializer)
        }
    }
}

impl<'de> Deserialize<'de> for TenantShardId {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            is_human_readable_deserializer: bool,
        }

        impl<'de> serde::de::Visitor<'de> for IdVisitor {
            type Value = TenantShardId;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 18])")
                }
            }

            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 18] = Deserialize::deserialize(s)?;
                Ok(TenantShardId::from(id))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                TenantShardId::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_tuple(
                18,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}

impl Serialize for ShardIndex {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Binary encoding is not used in index_part.json, but is included in anticipation of
            // switching various structures (e.g. inter-process communication, remote metadata) to more
            // compact binary encodings in future.
            let mut packed: [u8; 2] = [0; 2];
            packed[0] = self.shard_number.0;
            packed[1] = self.shard_count.0;
            packed.serialize(serializer)
        }
    }
}

impl<'de> Deserialize<'de> for ShardIndex {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            is_human_readable_deserializer: bool,
        }

        impl<'de> serde::de::Visitor<'de> for IdVisitor {
            type Value = ShardIndex;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 2])")
                }
            }

            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 2] = Deserialize::deserialize(s)?;
                Ok(ShardIndex::from(id))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                ShardIndex::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_tuple(
                2,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}
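The human-readable serde path can be exercised with serde_json; note that serde_json itself is an assumption of this sketch, not something this diff adds. The binary path simply packs the same id into the 18-byte array shown above.

use std::str::FromStr;
use utils::shard::TenantShardId;

fn main() -> Result<(), serde_json::Error> {
    let id = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0102")
        .expect("valid literal");

    // Human-readable formats use the string encoding...
    let json = serde_json::to_string(&id)?;
    assert_eq!(json, "\"072f1291a5310026820b2fe4b2968934-0102\"");

    // ...and accept it back, which is what keeps API payloads forward/backward compatible.
    let back: TenantShardId = serde_json::from_str(&json)?;
    assert_eq!(back, id);
    Ok(())
}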
libs/utils/src/toml_edit_ext.rs (new file, 22 lines)
@@ -0,0 +1,22 @@
#[derive(Debug, thiserror::Error)]
pub enum Error {
    #[error("item is not a document")]
    ItemIsNotADocument,
    #[error(transparent)]
    Serde(toml_edit::de::Error),
}

pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
where
    T: serde::de::DeserializeOwned,
{
    let document: toml_edit::Document = match item {
        toml_edit::Item::Table(toml) => toml.clone().into(),
        toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
            toml.clone().into_table().into()
        }
        _ => return Err(Error::ItemIsNotADocument),
    };

    toml_edit::de::from_document(document).map_err(Error::Serde)
}
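A sketch of how a caller might use deserialize_item, in the spirit of the l0_flush parsing added to the pageserver config further below. The target struct and its fields here are hypothetical stand-ins, not the real L0FlushConfig, and anyhow is assumed for error plumbing.

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct L0FlushLike {
    // Hypothetical fields, purely for illustration.
    mode: String,
    max_concurrency: usize,
}

fn main() -> anyhow::Result<()> {
    let doc: toml_edit::Document = r#"
        [l0_flush]
        mode = "direct"
        max_concurrency = 4
    "#
    .parse()?;

    // deserialize_item accepts either a table or an inline table and
    // funnels it through toml_edit's serde support.
    let item = doc.get("l0_flush").expect("table is present");
    let parsed: L0FlushLike = utils::toml_edit_ext::deserialize_item(item)?;
    assert_eq!(parsed.max_concurrency, 4);
    Ok(())
}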
@@ -62,6 +62,7 @@ sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
+tikv-jemallocator.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true

@@ -8,7 +8,7 @@ license.workspace = true
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
-reqwest.workspace = true
+reqwest = { workspace = true, features = [ "stream" ] }
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
@@ -9,6 +9,8 @@ use utils::{
    lsn::Lsn,
};

+pub use reqwest::Body as ReqwestBody;
+
pub mod util;

#[derive(Debug, Clone)]

@@ -20,6 +22,9 @@ pub struct Client {

#[derive(thiserror::Error, Debug)]
pub enum Error {
+    #[error("send request: {0}")]
+    SendRequest(reqwest::Error),
+
    #[error("receive body: {0}")]
    ReceiveBody(reqwest::Error),

@@ -173,19 +178,30 @@ impl Client {
        self.request(Method::GET, uri, ()).await
    }

+    fn start_request<U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+    ) -> reqwest::RequestBuilder {
+        let req = self.client.request(method, uri);
+        if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        }
+    }
+
    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
        body: B,
    ) -> Result<reqwest::Response> {
-        let req = self.client.request(method, uri);
-        let req = if let Some(value) = &self.authorization_header {
-            req.header(reqwest::header::AUTHORIZATION, value)
-        } else {
-            req
-        };
-        req.json(&body).send().await.map_err(Error::ReceiveBody)
+        self.start_request(method, uri)
+            .json(&body)
+            .send()
+            .await
+            .map_err(Error::ReceiveBody)
    }

    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
@@ -609,4 +625,53 @@ impl Client {
            }),
        }
    }
+
+    pub async fn import_basebackup(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        base_lsn: Lsn,
+        end_lsn: Lsn,
+        pg_version: u32,
+        basebackup_tarball: ReqwestBody,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
+            self.mgmt_api_endpoint,
+        );
+        self.start_request(Method::PUT, uri)
+            .body(basebackup_tarball)
+            .send()
+            .await
+            .map_err(Error::SendRequest)?
+            .error_from_body()
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn import_wal(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        start_lsn: Lsn,
+        end_lsn: Lsn,
+        wal_tarball: ReqwestBody,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
+            self.mgmt_api_endpoint,
+        );
+        self.start_request(Method::PUT, uri)
+            .body(wal_tarball)
+            .send()
+            .await
+            .map_err(Error::SendRequest)?
+            .error_from_body()
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
}
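A hedged sketch of driving the two new client methods above. The crate path pageserver_client::mgmt_api, the tarball path, and the pg_version value are assumptions made for illustration; Body::wrap_stream relies on the reqwest "stream" feature enabled in the Cargo.toml hunk above.

// Assumed import paths; adjust to wherever the mgmt API client actually lives.
use pageserver_client::mgmt_api::{Client, ReqwestBody};
use tokio_util::io::ReaderStream;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

async fn upload_base(
    client: &Client,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    base_lsn: Lsn,
    end_lsn: Lsn,
) -> anyhow::Result<()> {
    // Stream the tarball instead of buffering it in memory.
    let file = tokio::fs::File::open("/tmp/base.tar").await?;
    let body = ReqwestBody::wrap_stream(ReaderStream::new(file));
    client
        .import_basebackup(tenant_id, timeline_id, base_lsn, end_lsn, 16, body)
        .await?;
    Ok(())
}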
@@ -131,7 +131,7 @@ impl CompactionKey for Key {
pub type CompactionKeySpace<K> = Vec<Range<K>>;

/// Functions needed from all layers.
-pub trait CompactionLayer<K: CompactionKey + ?Sized> {
+pub trait CompactionLayer<K: CompactionKey> {
    fn key_range(&self) -> &Range<K>;
    fn lsn_range(&self) -> &Range<Lsn>;

@@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> {
    let toml_item = toml_document
        .get("remote_storage")
        .expect("need remote_storage");
-    let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
+    let config = RemoteStorageConfig::from_toml(toml_item)?;
    let storage = remote_storage::GenericRemoteStorage::from_config(&config);
    let cancel = CancellationToken::new();
    storage
@@ -348,35 +348,36 @@ where
                self.add_rel(rel, rel).await?;
            }
        }
-            for (path, content) in self
-                .timeline
-                .list_aux_files(self.lsn, self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
-            {
-                if path.starts_with("pg_replslot") {
-                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
-                    let restart_lsn = Lsn(u64::from_le_bytes(
-                        content[offs..offs + 8].try_into().unwrap(),
-                    ));
-                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
-                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
-                } else if path == "pg_logical/replorigin_checkpoint" {
-                    // replorigin_checkoint is written only on compute shutdown, so it contains
-                    // deteriorated values. So we generate our own version of this file for the particular LSN
-                    // based on information about replorigins extracted from transaction commit records.
-                    // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
-                    // but now we should handle (skip) it for backward compatibility.
-                    continue;
-                }
-                let header = new_tar_header(&path, content.len() as u64)?;
-                self.ar
-                    .append(&header, &*content)
-                    .await
-                    .context("could not add aux file to basebackup tarball")?;
-            }
        }
+
+        for (path, content) in self
+            .timeline
+            .list_aux_files(self.lsn, self.ctx)
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?
+        {
+            if path.starts_with("pg_replslot") {
+                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
+                let restart_lsn = Lsn(u64::from_le_bytes(
+                    content[offs..offs + 8].try_into().unwrap(),
+                ));
+                info!("Replication slot {} restart LSN={}", path, restart_lsn);
+                min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
+            } else if path == "pg_logical/replorigin_checkpoint" {
+                // replorigin_checkoint is written only on compute shutdown, so it contains
+                // deteriorated values. So we generate our own version of this file for the particular LSN
+                // based on information about replorigins extracted from transaction commit records.
+                // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
+                // but now we should handle (skip) it for backward compatibility.
+                continue;
+            }
+            let header = new_tar_header(&path, content.len() as u64)?;
+            self.ar
+                .append(&header, &*content)
+                .await
+                .context("could not add aux file to basebackup tarball")?;
+        }

        if min_restart_lsn != Lsn::MAX {
            info!(
                "Min restart LSN for logical replication is {}",
@@ -47,6 +47,9 @@ use utils::{
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
const PID_FILE_NAME: &str = "pageserver.pid";

const FEATURES: &[&str] = &[

@@ -421,6 +424,10 @@ fn start_pageserver(
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

+    info!(config=?conf.l0_flush, "using l0_flush config");
+    let l0_flush_global_state =
+        pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
+
    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(

@@ -429,6 +436,7 @@ fn start_pageserver(
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
            deletion_queue_client,
+            l0_flush_global_state,
        },
        order,
        shutdown_pageserver.clone(),

@@ -652,7 +660,6 @@ fn start_pageserver(
        async move {
            page_service::libpq_listener_main(
                tenant_manager,
-                broker_client,
                pg_auth,
                pageserver_listener,
                conf.pg_auth_type,
@@ -5,14 +5,13 @@
//! See also `settings.md` for better description on every parameter.

use anyhow::{anyhow, bail, ensure, Context, Result};
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
-use utils::id::ConnectionId;
use utils::logging::SecretString;

use once_cell::sync::OnceCell;

@@ -30,11 +29,11 @@ use utils::{
    logging::LogFormat,
};

-use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
+use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};

@@ -50,6 +49,7 @@ pub mod defaults {
        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
        DEFAULT_PG_LISTEN_PORT,
    };
+    use pageserver_api::models::ImageCompressionAlgorithm;
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";

@@ -90,6 +90,9 @@ pub mod defaults {
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

+    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
+        ImageCompressionAlgorithm::Disabled;
+
    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

@@ -159,7 +162,7 @@ pub mod defaults {
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}

-[remote_storage]
+#[remote_storage]

"#
);

@@ -285,12 +288,16 @@ pub struct PageServerConf {
    pub validate_vectored_get: bool,

+    pub image_compression: ImageCompressionAlgorithm,
+
    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
    /// of ephemeral data.
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,
+
+    pub l0_flush: L0FlushConfig,
}

/// We do not want to store this in a PageServerConf because the latter may be logged

@@ -395,7 +402,11 @@ struct PageServerConfigBuilder {
    validate_vectored_get: BuilderValue<bool>,

+    image_compression: BuilderValue<ImageCompressionAlgorithm>,
+
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
+    l0_flush: BuilderValue<L0FlushConfig>,
}

impl PageServerConfigBuilder {

@@ -482,8 +493,10 @@ impl PageServerConfigBuilder {
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
+            image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+            l0_flush: Set(L0FlushConfig::default()),
        }
    }
}

@@ -667,10 +680,18 @@ impl PageServerConfigBuilder {
        self.validate_vectored_get = BuilderValue::Set(value);
    }

+    pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
+        self.image_compression = BuilderValue::Set(value);
+    }
+
    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
    }

+    pub fn l0_flush(&mut self, value: L0FlushConfig) {
+        self.l0_flush = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -727,7 +748,9 @@ impl PageServerConfigBuilder {
            get_impl,
            max_vectored_read_bytes,
            validate_vectored_get,
+            image_compression,
            ephemeral_bytes_per_memory_kb,
+            l0_flush,
        }
        CUSTOM LOGIC
        {

@@ -846,22 +869,6 @@ impl PageServerConf {
        )
    }

-    pub fn traces_path(&self) -> Utf8PathBuf {
-        self.workdir.join("traces")
-    }
-
-    pub fn trace_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-        connection_id: &ConnectionId,
-    ) -> Utf8PathBuf {
-        self.traces_path()
-            .join(tenant_shard_id.to_string())
-            .join(timeline_id.to_string())
-            .join(connection_id.to_string())
-    }
-
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)

@@ -918,7 +925,7 @@ impl PageServerConf {
                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
                "remote_storage" => {
-                    builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
+                    builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?))
                }
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;

@@ -946,7 +953,7 @@ impl PageServerConf {
                    builder.metric_collection_endpoint(Some(endpoint));
                },
                "metric_collection_bucket" => {
-                    builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
+                    builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?))
                }
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),

@@ -1004,9 +1011,15 @@ impl PageServerConf {
                "validate_vectored_get" => {
                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
                }
+                "image_compression" => {
+                    builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
+                }
                "ephemeral_bytes_per_memory_kb" => {
                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                }
+                "l0_flush" => {
+                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }

@@ -1088,8 +1101,10 @@ impl PageServerConf {
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
            ),
+            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            l0_flush: L0FlushConfig::default(),
        }
    }
}

@@ -1328,7 +1343,9 @@ background_task_maximum_delay = '334 s'
                    .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                l0_flush: L0FlushConfig::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );

@@ -1401,7 +1418,9 @@ background_task_maximum_delay = '334 s'
                    .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                l0_flush: L0FlushConfig::default(),
            },
            "Should be able to parse all basic config values correctly"
        );

@@ -1524,34 +1543,6 @@ broker_endpoint = '{broker_endpoint}'
        Ok(())
    }

-    #[test]
-    fn parse_tenant_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-
-        let broker_endpoint = "http://127.0.0.1:7777";
-        let trace_read_requests = true;
-
-        let config_string = format!(
-            r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{pg_distrib_dir}'
-broker_endpoint = '{broker_endpoint}'
-
-[tenant_config]
-trace_read_requests = {trace_read_requests}"#,
-        );
-
-        let toml = config_string.parse()?;
-
-        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
-        assert_eq!(
-            conf.default_tenant_conf.trace_read_requests, trace_read_requests,
-            "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
-        );
-
-        Ok(())
-    }
-
    #[test]
    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
        let config_string = r#"

@@ -1681,6 +1672,19 @@ threshold = "20m"
        }
    }

+    #[test]
+    fn empty_remote_storage_is_error() {
+        let tempdir = tempdir().unwrap();
+        let (workdir, _) = prepare_fs(&tempdir).unwrap();
+        let input = r#"
+            remote_storage = {}
+        "#;
+        let doc = toml_edit::Document::from_str(input).unwrap();
+        let err = PageServerConf::parse_and_validate(&doc, &workdir)
+            .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
+        assert!(format!("{err}").contains("remote_storage"), "{err}");
+    }
+
    fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
        let tempdir_path = tempdir.path();
@@ -59,6 +59,7 @@
//! 1. It should be easy to forward the context to callees.
//! 2. To propagate more data from high-level to low-level code, the functions in
//!    the middle should not need to be modified.
+//!
//! The solution is to have a container structure ([`RequestContext`]) that
//! carries the information. Functions that don't care about what's in it
//! pass it along to callees.

@@ -190,7 +190,7 @@ where
            }
        } else {
            // If we failed validation, then do not apply any of the projected updates
-            warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+            info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
            metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
        }
    }

@@ -225,7 +225,7 @@ where
            && (tenant.generation == *validated_generation);

        if !this_list_valid {
-            warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+            info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
            metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
            mutated = true;
        } else {
@@ -265,15 +265,19 @@ paths:
            type: string
            format: hex
    post:
-      description: Obtain lease for the given LSN
-      parameters:
-        - name: lsn
-          in: query
-          required: true
-          schema:
-            type: string
-            format: hex
-          description: A LSN to obtain the lease for
+      description: Obtains a lease for the given LSN.
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - lsn
+              properties:
+                lsn:
+                  description: A LSN to obtain the lease for.
+                  type: string
+                  format: hex
      responses:
        "200":
          description: OK

@@ -869,8 +873,6 @@ components:
              type: string
            max_lsn_wal_lag:
              type: integer
-            trace_read_requests:
-              type: boolean
            heatmap_period:
              type: string
    TenantConfigResponse:
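With the schema change above (and the matching lsn_lease_handler change further below), the LSN now travels in a JSON request body instead of a query parameter. A sketch of the new call shape from an external client; the endpoint URL and port are hypothetical, and reqwest's json feature plus serde_json are assumptions of this example.

async fn request_lease(
    http: &reqwest::Client,
    tenant_shard_id: &str,
    timeline_id: &str,
    lsn: &str,
) -> anyhow::Result<()> {
    // Hypothetical local pageserver management endpoint.
    let url = format!(
        "http://127.0.0.1:9898/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease"
    );
    // Previously the LSN was passed as `?lsn=...`; it is now a JSON body
    // with the single required field `lsn`.
    http.post(url)
        .json(&serde_json::json!({ "lsn": lsn }))
        .send()
        .await?
        .error_for_status()?;
    Ok(())
}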
|
|||||||
@@ -10,6 +10,7 @@ use std::time::Duration;
|
|||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use enumset::EnumSet;
|
use enumset::EnumSet;
|
||||||
|
use futures::StreamExt;
|
||||||
use futures::TryFutureExt;
|
use futures::TryFutureExt;
|
||||||
use humantime::format_rfc3339;
|
use humantime::format_rfc3339;
|
||||||
use hyper::header;
|
use hyper::header;
|
||||||
@@ -22,6 +23,7 @@ use pageserver_api::models::ListAuxFilesRequest;
|
|||||||
use pageserver_api::models::LocationConfig;
|
use pageserver_api::models::LocationConfig;
|
||||||
use pageserver_api::models::LocationConfigListResponse;
|
use pageserver_api::models::LocationConfigListResponse;
|
||||||
use pageserver_api::models::LsnLease;
|
use pageserver_api::models::LsnLease;
|
||||||
|
use pageserver_api::models::LsnLeaseRequest;
|
||||||
use pageserver_api::models::ShardParameters;
|
use pageserver_api::models::ShardParameters;
|
||||||
use pageserver_api::models::TenantDetails;
|
use pageserver_api::models::TenantDetails;
|
||||||
use pageserver_api::models::TenantLocationConfigResponse;
|
use pageserver_api::models::TenantLocationConfigResponse;
|
||||||
@@ -42,13 +44,15 @@ use pageserver_api::shard::TenantShardId;
|
|||||||
use remote_storage::DownloadError;
|
use remote_storage::DownloadError;
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use remote_storage::TimeTravelError;
|
use remote_storage::TimeTravelError;
|
||||||
use tenant_size_model::{SizeResult, StorageModel};
|
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
|
||||||
|
use tokio_util::io::StreamReader;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::auth::JwtAuth;
|
use utils::auth::JwtAuth;
|
||||||
use utils::failpoint_support::failpoints_handler;
|
use utils::failpoint_support::failpoints_handler;
|
||||||
use utils::http::endpoint::prometheus_metrics_handler;
|
use utils::http::endpoint::prometheus_metrics_handler;
|
||||||
use utils::http::endpoint::request_span;
|
use utils::http::endpoint::request_span;
|
||||||
|
use utils::http::request::must_parse_query_param;
|
||||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||||
|
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
@@ -227,7 +231,7 @@ impl From<UpsertLocationError> for ApiError {
|
|||||||
BadRequest(e) => ApiError::BadRequest(e),
|
BadRequest(e) => ApiError::BadRequest(e),
|
||||||
Unavailable(_) => ApiError::ShuttingDown,
|
Unavailable(_) => ApiError::ShuttingDown,
|
||||||
e @ InProgress => ApiError::Conflict(format!("{e}")),
|
e @ InProgress => ApiError::Conflict(format!("{e}")),
|
||||||
Flush(e) | Other(e) => ApiError::InternalServerError(e),
|
Flush(e) | InternalError(e) => ApiError::InternalServerError(e),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -406,6 +410,8 @@ async fn build_timeline_info_common(
|
|||||||
|
|
||||||
let walreceiver_status = timeline.walreceiver_status();
|
let walreceiver_status = timeline.walreceiver_status();
|
||||||
|
|
||||||
|
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
|
||||||
|
|
||||||
let info = TimelineInfo {
|
let info = TimelineInfo {
|
||||||
tenant_id: timeline.tenant_shard_id,
|
tenant_id: timeline.tenant_shard_id,
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
@@ -426,6 +432,8 @@ async fn build_timeline_info_common(
|
|||||||
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
||||||
current_physical_size,
|
current_physical_size,
|
||||||
current_logical_size_non_incremental: None,
|
current_logical_size_non_incremental: None,
|
||||||
|
pitr_history_size,
|
||||||
|
within_ancestor_pitr,
|
||||||
timeline_dir_layer_file_size_sum: None,
|
timeline_dir_layer_file_size_sum: None,
|
||||||
wal_source_connstr,
|
wal_source_connstr,
|
||||||
last_received_msg_lsn,
|
last_received_msg_lsn,
|
||||||
@@ -1191,10 +1199,15 @@ fn synthetic_size_html_response(
|
|||||||
timeline_map.insert(ti.timeline_id, index);
|
timeline_map.insert(ti.timeline_id, index);
|
||||||
timeline_ids.push(ti.timeline_id.to_string());
|
timeline_ids.push(ti.timeline_id.to_string());
|
||||||
}
|
}
|
||||||
let seg_to_branch: Vec<usize> = inputs
|
let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs
|
||||||
.segments
|
.segments
|
||||||
.iter()
|
.iter()
|
||||||
.map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
|
.map(|seg| {
|
||||||
|
(
|
||||||
|
*timeline_map.get(&seg.timeline_id).unwrap(),
|
||||||
|
seg.kind.into(),
|
||||||
|
)
|
||||||
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let svg =
|
let svg =
|
||||||
@@ -1296,7 +1309,7 @@ async fn update_tenant_config_handler(
|
|||||||
|
|
||||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||||
tenant.set_new_tenant_config(new_tenant_conf);
|
tenant.set_new_tenant_config(new_tenant_conf);
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
@@ -1527,15 +1540,13 @@ async fn handle_tenant_break(
|
|||||||
|
|
||||||
// Obtains an lsn lease on the given timeline.
|
// Obtains an lsn lease on the given timeline.
|
||||||
async fn lsn_lease_handler(
|
async fn lsn_lease_handler(
|
||||||
request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
let lsn = json_request::<LsnLeaseRequest>(&mut request).await?.lsn;
|
||||||
let lsn: Lsn = parse_query_param(&request, "lsn")?
|
|
||||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
|
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
@@ -2396,6 +2407,189 @@ async fn post_top_tenants(
     )
 }

+async fn put_tenant_timeline_import_basebackup(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
+    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
+    let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
+
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
+    async move {
+        let state = get_state(&request);
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
+
+        let broker_client = state.broker_client.clone();
+
+        let mut body = StreamReader::new(request.into_body().map(|res| {
+            res.map_err(|error| {
+                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
+            })
+        }));
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+        let timeline = tenant
+            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
+            .map_err(ApiError::InternalServerError)
+            .await?;
+
+        // TODO mark timeline as not ready until it reaches end_lsn.
+        // We might have some wal to import as well, and we should prevent compute
+        // from connecting before that and writing conflicting wal.
+        //
+        // This is not relevant for pageserver->pageserver migrations, since there's
+        // no wal to import. But should be fixed if we want to import from postgres.
+
+        // TODO leave clean state on error. For now you can use detach to clean
+        // up broken state from a failed import.
+
+        // Import basebackup provided via CopyData
+        info!("importing basebackup");
+
+        timeline
+            .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        // Read the end of the tar archive.
+        read_tar_eof(body)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        // TODO check checksum
+        // Meanwhile you can verify client-side by taking fullbackup
+        // and checking that it matches in size with what was imported.
+        // It wouldn't work if base came from vanilla postgres though,
+        // since we discard some log files.
+
+        info!("done");
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(span)
+    .await
+}
+
+async fn put_tenant_timeline_import_wal(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
+    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
+
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
+    async move {
+        let state = get_state(&request);
+
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
+
+        let mut body = StreamReader::new(request.into_body().map(|res| {
+            res.map_err(|error| {
+                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
+            })
+        }));
+
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
+        }
+
+        // TODO leave clean state on error. For now you can use detach to clean
+        // up broken state from a failed import.
+
+        // Import wal provided via CopyData
+        info!("importing wal");
+        crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
+        info!("wal import complete");
+
+        // Read the end of the tar archive.
+        read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
+
+        // TODO Does it make sense to overshoot?
+        if timeline.get_last_record_lsn() < end_lsn {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
+        }
+
+        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
+        // We only want to persist the data, and it doesn't matter if it's in the
+        // shape of deltas or images.
+        info!("flushing layers");
+        timeline.freeze_and_flush().await.map_err(|e| match e {
+            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
+            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+        })?;
+
+        info!("done");
+
+        json_response(StatusCode::OK, ())
+    }.instrument(span).await
+}
+
+/// Read the end of a tar archive.
+///
+/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
+/// `tokio_tar` already read the first such block. Read the second all-zeros block,
+/// and check that there is no more data after the EOF marker.
+///
+/// 'tar' command can also write extra blocks of zeros, up to a record
+/// size, controlled by the --record-size argument. Ignore them too.
+async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    let mut buf = [0u8; 512];
+
+    // Read the all-zeros block, and verify it
+    let mut total_bytes = 0;
+    while total_bytes < 512 {
+        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
+        total_bytes += nbytes;
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if total_bytes < 512 {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    if !buf.iter().all(|&x| x == 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Drain any extra zero-blocks after the EOF marker
+    let mut trailing_bytes = 0;
+    let mut seen_nonzero_bytes = false;
+    loop {
+        let nbytes = reader.read(&mut buf).await?;
+        trailing_bytes += nbytes;
+        if !buf.iter().all(|&x| x == 0) {
+            seen_nonzero_bytes = true;
+        }
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if seen_nonzero_bytes {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % 512 != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
+    }
+    Ok(())
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2690,5 +2884,13 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
             |r| testing_api_handler("perf_info", r, perf_info),
         )
+        .put(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
+            |r| api_handler(r, put_tenant_timeline_import_basebackup),
+        )
+        .put(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
+            |r| api_handler(r, put_tenant_timeline_import_wal),
+        )
         .any(handler_404))
 }
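These two routes move basebackup and WAL import onto the HTTP management API (the libpq "import basebackup" / "import wal" commands are removed from the page service later in this diff). A rough sketch of driving the basebackup endpoint from a client; the path and query parameter names come from the handlers above, while the host, port, LSNs, Postgres version, and absence of auth headers are placeholders and assumptions.

// Hypothetical caller of the import_basebackup endpoint; host/port and values are placeholders.
fn import_basebackup() -> Result<(), Box<dyn std::error::Error>> {
    let tenant_id = "<tenant_id>";
    let timeline_id = "<timeline_id>";
    let url = format!(
        "http://localhost:9898/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup\
         ?base_lsn=0/169AD58&end_lsn=0/169AD58&pg_version=15"
    );
    // The request body is the raw base.tar produced by a basebackup.
    let body = std::fs::read("base.tar")?;
    let resp = reqwest::blocking::Client::new().put(url).body(body).send()?;
    assert!(resp.status().is_success());
    Ok(())
}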
pageserver/src/l0_flush.rs (new file, 46 lines)
@@ -0,0 +1,46 @@
+use std::{num::NonZeroUsize, sync::Arc};
+
+use crate::tenant::ephemeral_file;
+
+#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+pub enum L0FlushConfig {
+    #[default]
+    PageCached,
+    #[serde(rename_all = "snake_case")]
+    Direct { max_concurrency: NonZeroUsize },
+}
+
+#[derive(Clone)]
+pub struct L0FlushGlobalState(Arc<Inner>);
+
+pub(crate) enum Inner {
+    PageCached,
+    Direct { semaphore: tokio::sync::Semaphore },
+}
+
+impl L0FlushGlobalState {
+    pub fn new(config: L0FlushConfig) -> Self {
+        match config {
+            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
+            L0FlushConfig::Direct { max_concurrency } => {
+                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
+                Self(Arc::new(Inner::Direct { semaphore }))
+            }
+        }
+    }
+
+    pub(crate) fn inner(&self) -> &Arc<Inner> {
+        &self.0
+    }
+}
+
+impl L0FlushConfig {
+    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
+        use L0FlushConfig::*;
+        match self {
+            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
+            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
+        }
+    }
+}
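The serde attributes above fix the external representation of `L0FlushConfig`: an internal `mode` tag, kebab-case variant names, and snake_case field names. A self-contained sketch of that representation, using a local mirror of the enum and JSON purely for illustration (the config format the pageserver actually reads this from is not shown in this diff):

use std::num::NonZeroUsize;

// Local mirror of the enum above, only for demonstrating the serde representation.
#[derive(Debug, PartialEq, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
enum L0FlushConfig {
    PageCached,
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
}

fn main() {
    let page_cached: L0FlushConfig =
        serde_json::from_str(r#"{ "mode": "page-cached" }"#).unwrap();
    assert_eq!(page_cached, L0FlushConfig::PageCached);

    let direct: L0FlushConfig =
        serde_json::from_str(r#"{ "mode": "direct", "max_concurrency": 4 }"#).unwrap();
    assert_eq!(
        direct,
        L0FlushConfig::Direct { max_concurrency: NonZeroUsize::new(4).unwrap() }
    );
}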
@@ -11,6 +11,7 @@ pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
+pub mod l0_flush;
 pub use pageserver_api::keyspace;
 pub mod aux_file;
 pub mod metrics;
@@ -22,7 +23,6 @@ pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
-pub mod trace;
 pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
@@ -8,7 +8,7 @@ use metrics::{
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
-use strum::{EnumCount, IntoEnumIterator, VariantNames};
+use strum::{EnumCount, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use tracing::warn;
 use utils::id::TimelineId;
@@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

+static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_pitr_history_size",
+        "Data written since PITR cutoff on this timeline",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_archive_size",
+        "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_standby_horizon",
@@ -476,7 +494,7 @@ static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
 static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_resident_physical_size",
-        "The size of the layer files present in the pageserver's filesystem.",
+        "The size of the layer files present in the pageserver's filesystem, for attached locations.",
         &["tenant_id", "shard_id", "timeline_id"]
     )
     .expect("failed to define a metric")
@@ -551,6 +569,22 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

+pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_circuit_breaker_broken",
+        "How many times a circuit breaker has broken"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_circuit_breaker_unbroken",
+        "How many times a circuit breaker has been un-broken (recovered)"
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) mod initial_logical_size {
     use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
     use once_cell::sync::Lazy;
@@ -1076,21 +1110,12 @@ pub(crate) mod virtual_file_io_engine {
     });
 }

-#[derive(Debug)]
-struct GlobalAndPerTimelineHistogram {
-    global: Histogram,
-    per_tenant_timeline: Histogram,
-}
-
-impl GlobalAndPerTimelineHistogram {
-    fn observe(&self, value: f64) {
-        self.global.observe(value);
-        self.per_tenant_timeline.observe(value);
-    }
-}
-
 struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
-    h: &'a GlobalAndPerTimelineHistogram,
+    global_metric: &'a Histogram,
+
+    // Optional because not all op types are tracked per-timeline
+    timeline_metric: Option<&'a Histogram>,
+
     ctx: &'c RequestContext,
     start: std::time::Instant,
     op: SmgrQueryType,
@@ -1121,7 +1146,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                 elapsed
             }
         };
-        self.h.observe(ex_throttled.as_secs_f64());
+        self.global_metric.observe(ex_throttled.as_secs_f64());
+        if let Some(timeline_metric) = self.timeline_metric {
+            timeline_metric.observe(ex_throttled.as_secs_f64());
+        }
     }
 }

@@ -1146,7 +1174,8 @@ pub enum SmgrQueryType {

 #[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
-    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
+    global_metrics: [Histogram; SmgrQueryType::COUNT],
+    per_timeline_getpage: Histogram,
 }

 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
@@ -1224,27 +1253,32 @@ impl SmgrQueryTimePerTimeline {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
         let shard_slug = format!("{}", tenant_shard_id.shard_slug());
         let timeline_id = timeline_id.to_string();
-        let metrics = std::array::from_fn(|i| {
+        let global_metrics = std::array::from_fn(|i| {
             let op = SmgrQueryType::from_repr(i).unwrap();
-            let global = SMGR_QUERY_TIME_GLOBAL
+            SMGR_QUERY_TIME_GLOBAL
                 .get_metric_with_label_values(&[op.into()])
-                .unwrap();
-            let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
-                .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
-                .unwrap();
-            GlobalAndPerTimelineHistogram {
-                global,
-                per_tenant_timeline,
-            }
+                .unwrap()
         });
-        Self { metrics }
+        let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+            .get_metric_with_label_values(&[
+                SmgrQueryType::GetPageAtLsn.into(),
+                &tenant_id,
+                &shard_slug,
+                &timeline_id,
+            ])
+            .unwrap();
+        Self {
+            global_metrics,
+            per_timeline_getpage,
+        }
     }
     pub(crate) fn start_timer<'c: 'a, 'a>(
         &'a self,
         op: SmgrQueryType,
         ctx: &'c RequestContext,
-    ) -> impl Drop + '_ {
-        let metric = &self.metrics[op as usize];
+    ) -> Option<impl Drop + '_> {
+        let global_metric = &self.global_metrics[op as usize];
         let start = Instant::now();
         match ctx.micros_spent_throttled.open() {
             Ok(()) => (),
@@ -1263,12 +1297,20 @@ impl SmgrQueryTimePerTimeline {
                 });
             }
         }
-        GlobalAndPerTimelineHistogramTimer {
-            h: metric,
+        let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
+            Some(&self.per_timeline_getpage)
+        } else {
+            None
+        };
+
+        Some(GlobalAndPerTimelineHistogramTimer {
+            global_metric,
+            timeline_metric,
             ctx,
             start,
             op,
-        }
+        })
     }
 }

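After this refactor every op type still feeds a global histogram, but only GetPage requests also feed the per-tenant-timeline histogram, which is why the timer carries an `Option<&Histogram>`. A stripped-down sketch of the drop-based pattern, with plain closures standing in for the prometheus histograms:

use std::time::Instant;

// Illustrative stand-ins for the two sinks; the real code observes prometheus Histograms.
struct Timer<'a> {
    global: &'a dyn Fn(f64),
    per_timeline: Option<&'a dyn Fn(f64)>,
    start: Instant,
}

impl Drop for Timer<'_> {
    fn drop(&mut self) {
        let secs = self.start.elapsed().as_secs_f64();
        // Always record into the global sink, and into the per-timeline one only if present.
        (self.global)(secs);
        if let Some(observe) = self.per_timeline {
            observe(secs);
        }
    }
}

fn main() {
    let global = |v: f64| println!("global histogram observe {v}");
    let per_timeline = |v: f64| println!("per-timeline histogram observe {v}");
    // GetPage-style op: both sinks; other ops would pass None for per_timeline.
    let _t = Timer { global: &global, per_timeline: Some(&per_timeline), start: Instant::now() };
}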
@@ -1315,17 +1357,9 @@ mod smgr_query_time_tests {
         let get_counts = || {
             let global: u64 = ops
                 .iter()
-                .map(|op| metrics.metrics[*op as usize].global.get_sample_count())
+                .map(|op| metrics.global_metrics[*op as usize].get_sample_count())
                 .sum();
-            let per_tenant_timeline: u64 = ops
-                .iter()
-                .map(|op| {
-                    metrics.metrics[*op as usize]
-                        .per_tenant_timeline
-                        .get_sample_count()
-                })
-                .sum();
-            (global, per_tenant_timeline)
+            (global, metrics.per_timeline_getpage.get_sample_count())
         };

         let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1336,7 +1370,12 @@ mod smgr_query_time_tests {
         drop(timer);

         let (post_global, post_per_tenant_timeline) = get_counts();
-        assert_eq!(post_per_tenant_timeline, 1);
+        if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
+            // getpage ops are tracked per-timeline, others aren't
+            assert_eq!(post_per_tenant_timeline, 1);
+        } else {
+            assert_eq!(post_per_tenant_timeline, 0);
+        }
         assert!(post_global > pre_global);
     }
 }
@@ -1433,10 +1472,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
     }
 }

-pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_live_connections",
-        "Number of live network connections",
+pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_live_connections_started",
+        "Number of network connections that we started handling",
+        "pageserver_live_connections_finished",
+        "Number of network connections that we finished handling",
         &["pageserver_connection_kind"]
     )
     .expect("failed to define a metric")
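Switching from a single live-connections gauge to a started/finished counter pair keeps both series monotonic, so the number of live connections can still be derived as started minus finished while connection churn stays visible as a rate. The guard shape is roughly the following; this is an illustration of the pattern, not the actual `IntCounterPair` implementation in the metrics crate:

use std::sync::atomic::{AtomicU64, Ordering};

// Two monotonically increasing counters; "live" connections = started - finished.
static STARTED: AtomicU64 = AtomicU64::new(0);
static FINISHED: AtomicU64 = AtomicU64::new(0);

struct ConnectionGuard;

impl ConnectionGuard {
    fn new() -> Self {
        STARTED.fetch_add(1, Ordering::Relaxed);
        ConnectionGuard
    }
}

impl Drop for ConnectionGuard {
    fn drop(&mut self) {
        // Runs on every exit path, including unwinds, so the pair stays balanced.
        FINISHED.fetch_add(1, Ordering::Relaxed);
    }
}

fn main() {
    {
        let _guard = ConnectionGuard::new();
        // ... handle the connection ...
    }
    let live = STARTED.load(Ordering::Relaxed) - FINISHED.load(Ordering::Relaxed);
    assert_eq!(live, 0);
}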
@@ -1447,10 +1488,7 @@ pub(crate) enum ComputeCommandKind {
     PageStreamV2,
     PageStream,
     Basebackup,
-    GetLastRecordRlsn,
     Fullbackup,
-    ImportBasebackup,
-    ImportWal,
     LeaseLsn,
     Show,
 }
@@ -1691,6 +1729,15 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
     }
 });

+pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_secondary_resident_physical_size",
+        "The size of the layer files present in the pageserver's filesystem, for secondary locations.",
+        &["tenant_id", "shard_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -2093,6 +2140,8 @@ pub(crate) struct TimelineMetrics {
     pub garbage_collect_histo: StorageTimeMetrics,
     pub find_gc_cutoffs_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
+    pub pitr_history_size: UIntGauge,
+    pub archival_size: UIntGauge,
     pub standby_horizon_gauge: IntGauge,
     pub resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
@@ -2166,6 +2215,15 @@ impl TimelineMetrics {
         let last_record_gauge = LAST_RECORD_LSN
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+
+        let pitr_history_size = PITR_HISTORY_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
+        let archival_size = TIMELINE_ARCHIVE_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
         let standby_horizon_gauge = STANDBY_HORIZON
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
@@ -2218,6 +2276,8 @@ impl TimelineMetrics {
             find_gc_cutoffs_histo,
             load_layer_map_histo,
             last_record_gauge,
+            pitr_history_size,
+            archival_size,
             standby_horizon_gauge,
             resident_physical_size_gauge,
             current_logical_size_gauge,
@@ -2275,6 +2335,10 @@ impl TimelineMetrics {
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
             let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
+
+        let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+
         let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2308,14 +2372,12 @@ impl TimelineMetrics {
             let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
         }

-        for op in SmgrQueryType::iter() {
-            let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
-                op.into(),
-                tenant_id,
-                shard_id,
-                timeline_id,
-            ]);
-        }
+        let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
+            SmgrQueryType::GetPageAtLsn.into(),
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
     }
 }

@@ -4,9 +4,7 @@
 use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
-use bytes::Bytes;
 use futures::stream::FuturesUnordered;
-use futures::Stream;
 use futures::StreamExt;
 use pageserver_api::key::Key;
 use pageserver_api::models::TenantState;
@@ -28,7 +26,6 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::io;
 use std::net::TcpListener;
-use std::pin::pin;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -37,10 +34,8 @@ use std::time::Instant;
 use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::id::ConnectionId;
 use utils::sync::gate::GateGuard;
 use utils::{
     auth::{Claims, Scope, SwappableJwtAuth},
@@ -53,9 +48,8 @@ use crate::auth::check_permission;
 use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
-use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
+use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
 use crate::pgdatadir_mapping::Version;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
@@ -66,13 +60,11 @@ use crate::tenant::mgr::GetTenantError;
 use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
-use crate::tenant::timeline::FlushLayerError;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Tenant;
 use crate::tenant::Timeline;
-use crate::trace::Tracer;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -82,56 +74,6 @@ use postgres_ffi::BLCKSZ;
 // is not yet in state [`TenantState::Active`].
 const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

-/// Read the end of a tar archive.
-///
-/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
-/// `tokio_tar` already read the first such block. Read the second all-zeros block,
-/// and check that there is no more data after the EOF marker.
-///
-/// 'tar' command can also write extra blocks of zeros, up to a record
-/// size, controlled by the --record-size argument. Ignore them too.
-async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
-    use tokio::io::AsyncReadExt;
-    let mut buf = [0u8; 512];
-
-    // Read the all-zeros block, and verify it
-    let mut total_bytes = 0;
-    while total_bytes < 512 {
-        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
-        total_bytes += nbytes;
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if total_bytes < 512 {
-        anyhow::bail!("incomplete or invalid tar EOF marker");
-    }
-    if !buf.iter().all(|&x| x == 0) {
-        anyhow::bail!("invalid tar EOF marker");
-    }
-
-    // Drain any extra zero-blocks after the EOF marker
-    let mut trailing_bytes = 0;
-    let mut seen_nonzero_bytes = false;
-    loop {
-        let nbytes = reader.read(&mut buf).await?;
-        trailing_bytes += nbytes;
-        if !buf.iter().all(|&x| x == 0) {
-            seen_nonzero_bytes = true;
-        }
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if seen_nonzero_bytes {
-        anyhow::bail!("unexpected non-zero bytes after the tar archive");
-    }
-    if trailing_bytes % 512 != 0 {
-        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
-    }
-    Ok(())
-}
-
 ///////////////////////////////////////////////////////////////////////////////

 ///
@@ -141,7 +83,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 ///
 pub async fn libpq_listener_main(
     tenant_manager: Arc<TenantManager>,
-    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     listener: TcpListener,
     auth_type: AuthType,
@@ -186,7 +127,6 @@ pub async fn libpq_listener_main(
                 false,
                 page_service_conn_main(
                     tenant_manager.clone(),
-                    broker_client.clone(),
                     local_auth,
                     socket,
                     auth_type,
@@ -209,20 +149,14 @@ pub async fn libpq_listener_main(
 #[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
     tenant_manager: Arc<TenantManager>,
-    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
     auth_type: AuthType,
     connection_ctx: RequestContext,
 ) -> anyhow::Result<()> {
-    // Immediately increment the gauge, then create a job to decrement it on task exit.
-    // One of the pros of `defer!` is that this will *most probably*
-    // get called, even in presence of panics.
-    let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
-    gauge.inc();
-    scopeguard::defer! {
-        gauge.dec();
-    }
+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["page_service"])
+        .guard();

     socket
         .set_nodelay(true)
@@ -267,12 +201,11 @@ async fn page_service_conn_main(
     // and create a child per-query context when it invokes process_query.
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
     // and create the per-query context in process_query ourselves.
-    let mut conn_handler =
-        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
     let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

     match pgbackend
-        .run(&mut conn_handler, task_mgr::shutdown_watcher)
+        .run(&mut conn_handler, &task_mgr::shutdown_token())
         .await
     {
         Ok(()) => {
@@ -299,7 +232,6 @@ struct HandlerTimeline {
 }

 struct PageServerHandler {
-    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     claims: Option<Claims>,

@@ -391,13 +323,11 @@ impl From<WaitLsnError> for QueryError {
 impl PageServerHandler {
     pub fn new(
         tenant_manager: Arc<TenantManager>,
-        broker_client: storage_broker::BrokerClientChannel,
         auth: Option<Arc<SwappableJwtAuth>>,
         connection_ctx: RequestContext,
     ) -> Self {
         PageServerHandler {
             tenant_manager,
-            broker_client,
             auth,
             claims: None,
             connection_ctx,
@@ -480,73 +410,6 @@ impl PageServerHandler {
         )
     }

-    fn copyin_stream<'a, IO>(
-        &'a self,
-        pgb: &'a mut PostgresBackend<IO>,
-        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        async_stream::try_stream! {
-            loop {
-                let msg = tokio::select! {
-                    biased;
-
-                    _ = cancel.cancelled() => {
-                        // We were requested to shut down.
-                        let msg = "pageserver is shutting down";
-                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
-                        Err(QueryError::Shutdown)
-                    }
-
-                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
-                };
-
-                match msg {
-                    Ok(Some(message)) => {
-                        let copy_data_bytes = match message {
-                            FeMessage::CopyData(bytes) => bytes,
-                            FeMessage::CopyDone => { break },
-                            FeMessage::Sync => continue,
-                            FeMessage::Terminate => {
-                                let msg = "client terminated connection with Terminate message during COPY";
-                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                                // error can't happen here, ErrorResponse serialization should be always ok
-                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                                break;
-                            }
-                            m => {
-                                let msg = format!("unexpected message {m:?}");
-                                // error can't happen here, ErrorResponse serialization should be always ok
-                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
-                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
-                                break;
-                            }
-                        };
-
-                        yield copy_data_bytes;
-                    }
-                    Ok(None) => {
-                        let msg = "client closed connection during COPY";
-                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                        // error can't happen here, ErrorResponse serialization should be always ok
-                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
-                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                    }
-                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
-                        Err(io_error)?;
-                    }
-                    Err(other) => {
-                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
-                    }
-                };
-            }
-        }
-    }
-
     #[instrument(skip_all)]
     async fn handle_pagerequests<IO>(
         &mut self,
@@ -565,18 +428,6 @@ impl PageServerHandler {
             .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
             .await?;

-        // Make request tracer if needed
-        let mut tracer = if tenant.get_trace_read_requests() {
-            let connection_id = ConnectionId::generate();
-            let path =
-                tenant
-                    .conf
-                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
-            Some(Tracer::new(path))
-        } else {
-            None
-        };
-
         // switch client to COPYBOTH
         pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
         self.flush_cancellable(pgb, &tenant.cancel).await?;
@@ -608,11 +459,6 @@ impl PageServerHandler {
             trace!("query: {copy_data_bytes:?}");
             fail::fail_point!("ps::handle-pagerequest-message");

-            // Trace request if needed
-            if let Some(t) = tracer.as_mut() {
-                t.trace(&copy_data_bytes)
-            }
-
             let neon_fe_msg =
                 PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;

@@ -718,128 +564,6 @@ impl PageServerHandler {
         Ok(())
     }

-    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
-    async fn handle_import_basebackup<IO>(
-        &self,
-        pgb: &mut PostgresBackend<IO>,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        base_lsn: Lsn,
-        _end_lsn: Lsn,
-        pg_version: u32,
-        ctx: RequestContext,
-    ) -> Result<(), QueryError>
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
-
-        // Create empty timeline
-        info!("creating new timeline");
-        let tenant = self
-            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
-            .await?;
-        let timeline = tenant
-            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
-            .await?;
-
-        // TODO mark timeline as not ready until it reaches end_lsn.
-        // We might have some wal to import as well, and we should prevent compute
-        // from connecting before that and writing conflicting wal.
-        //
-        // This is not relevant for pageserver->pageserver migrations, since there's
-        // no wal to import. But should be fixed if we want to import from postgres.
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import basebackup provided via CopyData
-        info!("importing basebackup");
-        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb, &tenant.cancel).await?;
-
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
-        timeline
-            .import_basebackup_from_tar(
-                tenant.clone(),
-                &mut copyin_reader,
-                base_lsn,
-                self.broker_client.clone(),
-                &ctx,
-            )
-            .await?;
-
-        // Read the end of the tar archive.
-        read_tar_eof(copyin_reader).await?;
-
-        // TODO check checksum
-        // Meanwhile you can verify client-side by taking fullbackup
-        // and checking that it matches in size with what was imported.
-        // It wouldn't work if base came from vanilla postgres though,
-        // since we discard some log files.
-
-        info!("done");
-        Ok(())
-    }
-
-    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
-    async fn handle_import_wal<IO>(
-        &self,
-        pgb: &mut PostgresBackend<IO>,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        start_lsn: Lsn,
-        end_lsn: Lsn,
-        ctx: RequestContext,
-    ) -> Result<(), QueryError>
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
-            .await?;
-        let last_record_lsn = timeline.get_last_record_lsn();
-        if last_record_lsn != start_lsn {
-            return Err(QueryError::Other(
-                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
-            );
-        }
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import wal provided via CopyData
-        info!("importing wal");
-        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
-        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
-        info!("wal import complete");
-
-        // Read the end of the tar archive.
-        read_tar_eof(copyin_reader).await?;
-
-        // TODO Does it make sense to overshoot?
-        if timeline.get_last_record_lsn() < end_lsn {
-            return Err(QueryError::Other(
-                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
-            );
-        }
-
-        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
-        // We only want to persist the data, and it doesn't matter if it's in the
-        // shape of deltas or images.
-        info!("flushing layers");
-        timeline.freeze_and_flush().await.map_err(|e| match e {
-            FlushLayerError::Cancelled => QueryError::Shutdown,
-            other => QueryError::Other(other.into()),
-        })?;
-
-        info!("done");
-        Ok(())
-    }
-
     /// Helper function to handle the LSN from client request.
     ///
     /// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1656,53 +1380,6 @@ where
                 metric_recording.observe(&res);
                 res?;
             }
-            // return pair of prev_lsn and last_lsn
-            else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
-                if params.len() != 2 {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "invalid param number for get_last_record_rlsn command"
-                    )));
-                }
-
-                let tenant_id = TenantId::from_str(params[0])
-                    .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-                let timeline_id = TimelineId::from_str(params[1])
-                    .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-                tracing::Span::current()
-                    .record("tenant_id", field::display(tenant_id))
-                    .record("timeline_id", field::display(timeline_id));
-
-                self.check_permission(Some(tenant_id))?;
-
-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::GetLastRecordRlsn)
-                    .inc();
-
-                async {
-                    let timeline = self
-                        .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
-                        .await?;
-
-                    let end_of_timeline = timeline.get_last_record_rlsn();
-
-                    pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                        RowDescriptor::text_col(b"prev_lsn"),
-                        RowDescriptor::text_col(b"last_lsn"),
-                    ]))?
-                    .write_message_noflush(&BeMessage::DataRow(&[
-                        Some(end_of_timeline.prev.to_string().as_bytes()),
-                        Some(end_of_timeline.last.to_string().as_bytes()),
-                    ]))?
-                    .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    anyhow::Ok(())
-                }
-                .instrument(info_span!(
-                    "handle_get_last_record_lsn",
-                    shard_id = tracing::field::Empty
-                ))
-                .await?;
-            }
             // same as basebackup, but result includes relational data as well
             else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
                 if params.len() < 2 {
@@ -1757,109 +1434,6 @@ where
             )
             .await?;
             pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("import basebackup ") {
-            // Import the `base` section (everything but the wal) of a basebackup.
-            // Assumes the tenant already exists on this pageserver.
-            //
-            // Files are scheduled to be persisted to remote storage, and the
-            // caller should poll the http api to check when that is done.
-            //
-            // Example import command:
-            // 1. Get start/end LSN from backup_manifest file
-            // 2. Run:
-            // cat my_backup/base.tar | psql -h $PAGESERVER \
-            //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
-            let params = &parts[2..];
-            if params.len() != 5 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for import basebackup command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-            let base_lsn = Lsn::from_str(params[2])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
-            let end_lsn = Lsn::from_str(params[3])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
-            let pg_version = u32::from_str(params[4])
-                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::ImportBasebackup)
-                .inc();
-
-            match self
-                .handle_import_basebackup(
-                    pgb,
-                    tenant_id,
-                    timeline_id,
-                    base_lsn,
-                    end_lsn,
-                    pg_version,
-                    ctx,
-                )
-                .await
-            {
-                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
-                Err(e) => {
-                    error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?
-                }
-            };
-        } else if query_string.starts_with("import wal ") {
-            // Import the `pg_wal` section of a basebackup.
-            //
-            // Files are scheduled to be persisted to remote storage, and the
-            // caller should poll the http api to check when that is done.
-            let params = &parts[2..];
-            if params.len() != 4 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for import wal command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-            let start_lsn = Lsn::from_str(params[2])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
-            let end_lsn = Lsn::from_str(params[3])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::ImportWal)
-                .inc();
-
-            match self
-                .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
-                .await
-            {
-                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
-                Err(e) => {
-                    error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?
-                }
-            };
         } else if query_string.to_ascii_lowercase().starts_with("set ") {
             // important because psycopg2 executes "SET datestyle TO 'ISO'"
             // on connect
@@ -522,7 +522,7 @@ impl Timeline {
         ctx: &RequestContext,
     ) -> Result<Option<TimestampTz>, PageReconstructError> {
         let mut max: Option<TimestampTz> = None;
-        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
+        self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
             if let Some(max_prev) = max {
                 max = Some(max_prev.max(timestamp));
             } else {
@@ -854,13 +854,14 @@ impl Timeline {
         result.add_key(DBDIR_KEY);

         // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf)?;
-
-        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
-        dbs.sort_unstable();
-        for (spcnode, dbnode) in dbs {
+        let dbdir = self.list_dbdirs(lsn, ctx).await?;
+        let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
+
+        dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
+        for ((spcnode, dbnode), has_relmap_file) in dbs {
+            if has_relmap_file {
             result.add_key(relmap_file_key(spcnode, dbnode));
+            }
             result.add_key(rel_dir_to_key(spcnode, dbnode));

             let mut rels: Vec<RelTag> = self
@@ -919,6 +920,9 @@ impl Timeline {
             result.add_key(AUX_FILES_KEY);
         }

+        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
+        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
+        // and the keys will not be garbage-colllected.
         #[cfg(test)]
         {
             let guard = self.extra_test_dense_keyspace.load();
@@ -927,13 +931,48 @@ impl Timeline {
             }
         }

-        Ok((
-            result.to_keyspace(),
-            /* AUX sparse key space */
-            SparseKeySpace(KeySpace {
-                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
-            }),
-        ))
+        let dense_keyspace = result.to_keyspace();
+        let sparse_keyspace = SparseKeySpace(KeySpace {
+            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
+        });
+
+        if cfg!(debug_assertions) {
+            // Verify if the sparse keyspaces are ordered and non-overlapping.
+
+            // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
+            // category of sparse keys are split into their own image/delta files. If there
+            // are overlapping keyspaces, they will be automatically merged by keyspace accum,
+            // and we want the developer to keep the keyspaces separated.
+
+            let ranges = &sparse_keyspace.0.ranges;
+
+            // TODO: use a single overlaps_with across the codebase
+            fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+                !(a.end <= b.start || b.end <= a.start)
+            }
+            for i in 0..ranges.len() {
+                for j in 0..i {
+                    if overlaps_with(&ranges[i], &ranges[j]) {
+                        panic!(
+                            "overlapping sparse keyspace: {}..{} and {}..{}",
+                            ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
+                        );
+                    }
+                }
+            }
+            for i in 1..ranges.len() {
+                assert!(
+                    ranges[i - 1].end <= ranges[i].start,
+                    "unordered sparse keyspace: {}..{} and {}..{}",
+                    ranges[i - 1].start,
+                    ranges[i - 1].end,
+                    ranges[i].start,
+                    ranges[i].end
+                );
+            }
+        }
+
+        Ok((dense_keyspace, sparse_keyspace))
     }

     /// Get cached size of relation if it not updated after specified LSN
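As a quick illustration of the overlap and ordering check added in the hunk above, here is a minimal standalone sketch over std::ops::Range. It reuses the same predicate shape; the concrete range values are made up for the example and are not real pageserver keys.

    use std::ops::Range;

    fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
        // Half-open ranges overlap unless one ends at or before the other starts.
        !(a.end <= b.start || b.end <= a.start)
    }

    fn main() {
        // Illustrative half-open ranges standing in for sparse key ranges.
        let ranges: Vec<Range<u32>> = vec![0..10, 10..20, 15..30];
        for i in 0..ranges.len() {
            for j in 0..i {
                if overlaps_with(&ranges[i], &ranges[j]) {
                    println!("overlapping: {:?} and {:?}", ranges[j], ranges[i]);
                }
            }
        }
        for win in ranges.windows(2) {
            if win[0].end > win[1].start {
                println!("unordered: {:?} before {:?}", win[0], win[1]);
            }
        }
    }

Running this reports the third range as both overlapping and unordered, which is exactly the condition the debug assertion in the diff is meant to catch.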
@@ -39,6 +39,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::backoff;
+use utils::circuit_breaker::CircuitBreaker;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::failpoint_support;
@@ -73,9 +74,11 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
+use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::TENANT;
 use crate::metrics::{
-    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
+    remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
+    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
 };
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -166,6 +169,7 @@ pub struct TenantSharedResources {
     pub broker_client: storage_broker::BrokerClientChannel,
     pub remote_storage: GenericRemoteStorage,
     pub deletion_queue_client: DeletionQueueClient,
+    pub l0_flush_global_state: L0FlushGlobalState,
 }

 /// A [`Tenant`] is really an _attached_ tenant. The configuration
@@ -274,6 +278,10 @@ pub struct Tenant {

     eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

+    /// Track repeated failures to compact, so that we can back off.
+    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
+    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
+
     /// If the tenant is in Activating state, notify this to encourage it
     /// to proceed to Active as soon as possible, rather than waiting for lazy
     /// background warmup.
@@ -294,6 +302,8 @@ pub struct Tenant {

     /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
     ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
+
+    l0_flush_global_state: L0FlushGlobalState,
 }

 impl std::fmt::Debug for Tenant {
@@ -529,6 +539,15 @@ impl From<PageReconstructError> for GcError {
     }
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum LoadConfigError {
+    #[error("TOML deserialization error: '{0}'")]
+    DeserializeToml(#[from] toml_edit::de::Error),
+
+    #[error("Config not found at {0}")]
+    NotFound(Utf8PathBuf),
+}
+
 impl Tenant {
     /// Yet another helper for timeline initialization.
     ///
@@ -667,6 +686,7 @@ impl Tenant {
             broker_client,
             remote_storage,
             deletion_queue_client,
+            l0_flush_global_state,
         } = resources;

         let attach_mode = attached_conf.location.attach_mode;
@@ -681,6 +701,7 @@ impl Tenant {
             tenant_shard_id,
             remote_storage.clone(),
             deletion_queue_client,
+            l0_flush_global_state,
         ));

         // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -980,6 +1001,7 @@ impl Tenant {
                 TimelineResources {
                     remote_client,
                     timeline_get_throttle: self.timeline_get_throttle.clone(),
+                    l0_flush_global_state: self.l0_flush_global_state.clone(),
                 },
                 ctx,
             )
@@ -1349,7 +1371,7 @@ impl Tenant {
         initdb_lsn: Lsn,
         pg_version: u32,
         ctx: &RequestContext,
-        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
+        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
         image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
@@ -1625,13 +1647,31 @@ impl Tenant {
             timelines_to_compact
         };

+        // Before doing any I/O work, check our circuit breaker
+        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
+            info!("Skipping compaction due to previous failures");
+            return Ok(());
+        }
+
         for (timeline_id, timeline) in &timelines_to_compact {
             timeline
                 .compact(cancel, EnumSet::empty(), ctx)
                 .instrument(info_span!("compact_timeline", %timeline_id))
-                .await?;
+                .await
+                .map_err(|e| {
+                    self.compaction_circuit_breaker
+                        .lock()
+                        .unwrap()
+                        .fail(&CIRCUIT_BREAKERS_BROKEN, &e);
+                    e
+                })?;
         }

+        self.compaction_circuit_breaker
+            .lock()
+            .unwrap()
+            .success(&CIRCUIT_BREAKERS_UNBROKEN);
+
         Ok(())
     }

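The compaction loop above consults a circuit breaker before doing any I/O and trips it when compaction fails. As a rough, standalone illustration of that idea (a simplified local type, not the utils::circuit_breaker::CircuitBreaker API used in the diff, and without the metrics arguments), a breaker counts consecutive failures and, past a threshold, stays open for a long backoff period:

    use std::time::{Duration, Instant};

    /// Simplified stand-in for a circuit breaker: trips after `threshold`
    /// consecutive failures and stays broken for `backoff`.
    struct Breaker {
        threshold: u32,
        backoff: Duration,
        failures: u32,
        broken_until: Option<Instant>,
    }

    impl Breaker {
        fn new(threshold: u32, backoff: Duration) -> Self {
            Self { threshold, backoff, failures: 0, broken_until: None }
        }
        fn is_broken(&self) -> bool {
            self.broken_until.map_or(false, |t| Instant::now() < t)
        }
        fn fail(&mut self) {
            self.failures += 1;
            if self.failures >= self.threshold {
                self.broken_until = Some(Instant::now() + self.backoff);
            }
        }
        fn success(&mut self) {
            self.failures = 0;
            self.broken_until = None;
        }
    }

    fn main() {
        // Mirrors the parameters used elsewhere in this diff: 5 failures, 24h backoff.
        let mut breaker = Breaker::new(5, Duration::from_secs(3600 * 24));
        for _ in 0..5 {
            breaker.fail();
        }
        assert!(breaker.is_broken()); // compaction would now be skipped
        breaker.success();
        assert!(!breaker.is_broken());
    }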
@@ -1800,9 +1840,15 @@ impl Tenant {
         // If we're still attaching, fire the cancellation token early to drop out: this
         // will prevent us flushing, but ensures timely shutdown if some I/O during attach
         // is very slow.
-        if matches!(self.current_state(), TenantState::Attaching) {
+        let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) {
             self.cancel.cancel();
-        }
+
+            // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens
+            // are children of ours, so their flush loops will have shut down already
+            timeline::ShutdownMode::Hard
+        } else {
+            shutdown_mode
+        };

         match self.set_stopping(shutdown_progress, false, false).await {
             Ok(()) => {}
@@ -2319,13 +2365,6 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
     }

-    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
-        tenant_conf
-            .trace_read_requests
-            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
-    }
-
     pub fn get_min_resident_size_override(&self) -> Option<u64> {
         let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
@@ -2469,6 +2508,7 @@ impl Tenant {
         tenant_shard_id: TenantShardId,
         remote_storage: GenericRemoteStorage,
         deletion_queue_client: DeletionQueueClient,
+        l0_flush_global_state: L0FlushGlobalState,
     ) -> Tenant {
         debug_assert!(
             !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
@@ -2547,6 +2587,14 @@ impl Tenant {
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
             eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
+                format!("compaction-{tenant_shard_id}"),
+                5,
+                // Compaction can be a very expensive operation, and might leak disk space. It also ought
+                // to be infallible, as long as remote storage is available. So if it repeatedly fails,
+                // use an extremely long backoff.
+                Some(Duration::from_secs(3600 * 24)),
+            )),
             activate_now_sem: tokio::sync::Semaphore::new(0),
             cancel: CancellationToken::default(),
             gate: Gate::default(),
@@ -2556,6 +2604,7 @@ impl Tenant {
             )),
             tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
             ongoing_timeline_detach: std::sync::Mutex::default(),
+            l0_flush_global_state,
         }
     }

@@ -2563,36 +2612,35 @@ impl Tenant {
     pub(super) fn load_tenant_config(
         conf: &'static PageServerConf,
         tenant_shard_id: &TenantShardId,
-    ) -> anyhow::Result<LocationConf> {
+    ) -> Result<LocationConf, LoadConfigError> {
         let config_path = conf.tenant_location_config_path(tenant_shard_id);

-        if config_path.exists() {
-            // New-style config takes precedence
-            let deserialized = Self::read_config(&config_path)?;
-            Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
-        } else {
-            // The config should almost always exist for a tenant directory:
-            // - When attaching a tenant, the config is the first thing we write
-            // - When detaching a tenant, we atomically move the directory to a tmp location
-            //   before deleting contents.
-            //
-            // The very rare edge case that can result in a missing config is if we crash during attach
-            // between creating directory and writing config. Callers should handle that as if the
-            // directory didn't exist.
-            anyhow::bail!("tenant config not found in {}", config_path);
-        }
-    }
-
-    fn read_config(path: &Utf8Path) -> anyhow::Result<toml_edit::Document> {
-        info!("loading tenant configuration from {path}");
+        info!("loading tenant configuration from {config_path}");

         // load and parse file
-        let config = fs::read_to_string(path)
-            .with_context(|| format!("Failed to load config from path '{path}'"))?;
-
-        config
-            .parse::<toml_edit::Document>()
-            .with_context(|| format!("Failed to parse config from file '{path}' as toml file"))
+        let config = fs::read_to_string(&config_path).map_err(|e| {
+            match e.kind() {
+                std::io::ErrorKind::NotFound => {
+                    // The config should almost always exist for a tenant directory:
+                    // - When attaching a tenant, the config is the first thing we write
+                    // - When detaching a tenant, we atomically move the directory to a tmp location
+                    //   before deleting contents.
+                    //
+                    // The very rare edge case that can result in a missing config is if we crash during attach
+                    // between creating directory and writing config. Callers should handle that as if the
+                    // directory didn't exist.
+                    LoadConfigError::NotFound(config_path)
+                }
+                _ => {
+                    // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues
+                    // that we cannot cleanly recover
+                    crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file")
+                }
+            }
+        })?;
+
+        Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
     }

     #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
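The rewritten load_tenant_config above splits I/O errors into an expected "not found" case and everything else, which is treated as fatal. Here is a standalone sketch of the same pattern using only the standard library; ConfigError and load_config are hypothetical names for the example, not the crate's LoadConfigError or on_fatal_io_error.

    use std::fs;
    use std::io::ErrorKind;
    use std::path::PathBuf;

    #[derive(Debug)]
    enum ConfigError {
        NotFound(PathBuf),
        Parse(String),
    }

    fn load_config(path: &PathBuf) -> Result<String, ConfigError> {
        let contents = fs::read_to_string(path).map_err(|e| match e.kind() {
            // A missing file is an expected condition that callers can handle.
            ErrorKind::NotFound => ConfigError::NotFound(path.clone()),
            // Anything else points at broken local storage; abort loudly.
            _ => panic!("fatal I/O error reading config: {e}"),
        })?;
        // Deserialization failures get their own variant (real parsing elided).
        if contents.trim().is_empty() {
            return Err(ConfigError::Parse("empty config".to_string()));
        }
        Ok(contents)
    }

    fn main() {
        match load_config(&PathBuf::from("/nonexistent/config-v1")) {
            Err(ConfigError::NotFound(p)) => println!("no config at {}", p.display()),
            other => println!("{other:?}"),
        }
    }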
@@ -2600,7 +2648,7 @@ impl Tenant {
         conf: &'static PageServerConf,
         tenant_shard_id: &TenantShardId,
         location_conf: &LocationConf,
-    ) -> anyhow::Result<()> {
+    ) -> std::io::Result<()> {
         let config_path = conf.tenant_location_config_path(tenant_shard_id);

         Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
@@ -2611,7 +2659,7 @@ impl Tenant {
         tenant_shard_id: &TenantShardId,
         config_path: &Utf8Path,
         location_conf: &LocationConf,
-    ) -> anyhow::Result<()> {
+    ) -> std::io::Result<()> {
         debug!("persisting tenantconf to {config_path}");

         let mut conf_content = r#"# This file contains a specific per-tenant's config.
@@ -2620,22 +2668,20 @@ impl Tenant {
         .to_string();

         fail::fail_point!("tenant-config-before-write", |_| {
-            anyhow::bail!("tenant-config-before-write");
+            Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "tenant-config-before-write",
+            ))
         });

         // Convert the config to a toml file.
-        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;
+        conf_content +=
+            &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed");

         let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);

-        let tenant_shard_id = *tenant_shard_id;
-        let config_path = config_path.to_owned();
         let conf_content = conf_content.into_bytes();
-        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
-            .await
-            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;
-
-        Ok(())
+        VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
     }

     //
@@ -2853,6 +2899,7 @@ impl Tenant {
             {
                 let mut target = timeline.gc_info.write().unwrap();

+                // Cull any expired leases
                 let now = SystemTime::now();
                 target.leases.retain(|_, lease| !lease.is_expired(&now));

@@ -2861,6 +2908,31 @@ impl Tenant {
                     .valid_lsn_lease_count_gauge
                     .set(target.leases.len() as u64);

+                // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
+                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
+                    if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
+                        target.within_ancestor_pitr =
+                            timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
+                    }
+                }
+
+                // Update metrics that depend on GC state
+                timeline
+                    .metrics
+                    .archival_size
+                    .set(if target.within_ancestor_pitr {
+                        timeline.metrics.current_logical_size_gauge.get()
+                    } else {
+                        0
+                    });
+                timeline.metrics.pitr_history_size.set(
+                    timeline
+                        .get_last_record_lsn()
+                        .checked_sub(target.cutoffs.pitr)
+                        .unwrap_or(Lsn(0))
+                        .0,
+                );
+
                 match gc_cutoffs.remove(&timeline.timeline_id) {
                     Some(cutoffs) => {
                         target.retain_lsns = branchpoints;
@@ -2912,7 +2984,7 @@ impl Tenant {
         dst_id: TimelineId,
         ancestor_lsn: Option<Lsn>,
         ctx: &RequestContext,
-        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
+        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
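The pitr_history_size metric above is the distance between the last record LSN and the PITR cutoff, clamped to zero when the cutoff is ahead. A tiny standalone sketch of the same arithmetic, using plain u64 values in place of the Lsn newtype:

    fn main() {
        // Stand-ins for Lsn values; the real code wraps u64 in an Lsn newtype.
        let last_record_lsn: u64 = 0x5000;
        let pitr_cutoff: u64 = 0x3000;

        // checked_sub falls back to 0 when the cutoff is ahead of the last record LSN.
        let pitr_history_size = last_record_lsn.checked_sub(pitr_cutoff).unwrap_or(0);
        assert_eq!(pitr_history_size, 0x2000);

        // within_ancestor_pitr is a plain comparison of the branch point against
        // the ancestor's PITR cutoff.
        let within_ancestor_pitr = |ancestor_lsn: u64, ancestor_pitr: u64| ancestor_lsn >= ancestor_pitr;
        assert!(within_ancestor_pitr(0x4000, 0x3000));
    }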
@@ -3296,6 +3368,7 @@ impl Tenant {
             TimelineResources {
                 remote_client,
                 timeline_get_throttle: self.timeline_get_throttle.clone(),
+                l0_flush_global_state: self.l0_flush_global_state.clone(),
             }
         }

@@ -3632,6 +3705,7 @@ pub(crate) mod harness {
     use utils::logging;

     use crate::deletion_queue::mock::MockDeletionQueue;
+    use crate::l0_flush::L0FlushConfig;
     use crate::walredo::apply_neon;
     use crate::{repository::Key, walrecord::NeonWalRecord};

@@ -3669,7 +3743,6 @@ pub(crate) mod harness {
             walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
             lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
             max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
-            trace_read_requests: Some(tenant_conf.trace_read_requests),
             eviction_policy: Some(tenant_conf.eviction_policy),
             min_resident_size_override: tenant_conf.min_resident_size_override,
             evictions_low_residence_duration_metric_threshold: Some(
@@ -3821,6 +3894,8 @@ pub(crate) mod harness {
             self.tenant_shard_id,
             self.remote_storage.clone(),
             self.deletion_queue.new_client(),
+            // TODO: ideally we should run all unit tests with both configs
+            L0FlushGlobalState::new(L0FlushConfig::default()),
         ));

         let preload = tenant
@@ -3908,7 +3983,7 @@ mod tests {
     use storage_layer::PersistentLayerKey;
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
-    use timeline::GcInfo;
+    use timeline::{DeltaLayerTestDesc, GcInfo};
     use utils::bin_ser::BeSer;
     use utils::id::TenantId;

@@ -6204,27 +6279,6 @@ mod tests {
     .await
     .unwrap();

-    async fn get_vectored_impl_wrapper(
-        tline: &Arc<Timeline>,
-        key: Key,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let mut reconstruct_state = ValuesReconstructState::new();
-        let mut res = tline
-            .get_vectored_impl(
-                KeySpace::single(key..key.next()),
-                lsn,
-                &mut reconstruct_state,
-                ctx,
-            )
-            .await?;
-        Ok(res.pop_last().map(|(k, v)| {
-            assert_eq!(k, key);
-            v.unwrap()
-        }))
-    }
-
     let lsn = Lsn(0x30);

     // test vectored get on parent timeline
@@ -6300,27 +6354,6 @@ mod tests {
     .await
     .unwrap();

-    async fn get_vectored_impl_wrapper(
-        tline: &Arc<Timeline>,
-        key: Key,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let mut reconstruct_state = ValuesReconstructState::new();
-        let mut res = tline
-            .get_vectored_impl(
-                KeySpace::single(key..key.next()),
-                lsn,
-                &mut reconstruct_state,
-                ctx,
-            )
-            .await?;
-        Ok(res.pop_last().map(|(k, v)| {
-            assert_eq!(k, key);
-            v.unwrap()
-        }))
-    }
-
     let lsn = Lsn(0x30);

     // test vectored get on parent timeline
@@ -6396,9 +6429,18 @@ mod tests {
         &ctx,
         // delta layers
         vec![
-            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x10)..Lsn(0x20),
+                vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
         ],
         // image layers
         vec![
@@ -6464,17 +6506,29 @@ mod tests {
         &ctx,
         // delta layers
         vec![
-            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
-            vec![
-                (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
-                (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
-            ],
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x10)..Lsn(0x20),
+                vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x30)..Lsn(0x40),
+                vec![
+                    (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
+                    (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
+                ],
+            ),
         ],
         // image layers
         vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
-        Lsn(0x30),
+        Lsn(0x40),
     )
     .await
     .unwrap();
@@ -6497,7 +6551,7 @@ mod tests {

     // Image layers are created at last_record_lsn
     let images = tline
-        .inspect_image_layers(Lsn(0x30), &ctx)
+        .inspect_image_layers(Lsn(0x40), &ctx)
         .await
         .unwrap()
         .into_iter()
@@ -6523,9 +6577,18 @@ mod tests {
         &ctx,
         // delta layers
         vec![
-            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x10)..Lsn(0x20),
+                vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
         ],
         // image layers
         vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
@@ -6573,15 +6636,21 @@ mod tests {
         key
     }

-    // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
+    // We create
+    // - one bottom-most image layer,
+    // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
+    // - a delta layer D2 crossing the GC horizon with data only below the horizon,
+    // - a delta layer D3 above the horizon.
     //
-    // | D1 | | D3 |
+    //                             | D3 |
+    // | D1 |
     // -| |-- gc horizon -----------------
     // | | | D2 |
     // --------- img layer ------------------
     //
     // What we should expact from this compaction is:
-    // | Part of D1 | | D3 |
+    //                             | D3 |
+    // | Part of D1 |
     // --------- img layer with D1+D2 at GC horizon------------------

     // img layer at 0x10
@@ -6621,13 +6690,13 @@ mod tests {
     let delta3 = vec![
         (
             get_key(8),
-            Lsn(0x40),
-            Value::Image(Bytes::from("value 8@0x40")),
+            Lsn(0x48),
+            Value::Image(Bytes::from("value 8@0x48")),
         ),
         (
             get_key(9),
-            Lsn(0x40),
-            Value::Image(Bytes::from("value 9@0x40")),
+            Lsn(0x48),
+            Value::Image(Bytes::from("value 9@0x48")),
         ),
     ];

@@ -6637,7 +6706,11 @@ mod tests {
         Lsn(0x10),
         DEFAULT_PG_VERSION,
         &ctx,
-        vec![delta1, delta2, delta3], // delta layers
+        vec![
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
+        ], // delta layers
         vec![(Lsn(0x10), img_layer)], // image layers
         Lsn(0x50),
     )
@@ -6658,8 +6731,8 @@ mod tests {
         Bytes::from_static(b"value 5@0x20"),
         Bytes::from_static(b"value 6@0x20"),
         Bytes::from_static(b"value 7@0x10"),
-        Bytes::from_static(b"value 8@0x40"),
-        Bytes::from_static(b"value 9@0x40"),
+        Bytes::from_static(b"value 8@0x48"),
+        Bytes::from_static(b"value 9@0x48"),
     ];

     for (idx, expected) in expected_result.iter().enumerate() {
@@ -6747,10 +6820,10 @@ mod tests {
                 lsn_range: Lsn(0x30)..Lsn(0x41),
                 is_delta: true
             },
-            // The delta layer we created and should not be picked for the compaction
+            // The delta3 layer that should not be picked for the compaction
             PersistentLayerKey {
                 key_range: get_key(8)..get_key(10),
-                lsn_range: Lsn(0x40)..Lsn(0x41),
+                lsn_range: Lsn(0x48)..Lsn(0x50),
                 is_delta: true
             }
         ]
@@ -6814,7 +6887,10 @@ mod tests {
         Lsn(0x10),
         DEFAULT_PG_VERSION,
         &ctx,
-        vec![delta1], // delta layers
+        vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x40),
+            delta1,
+        )], // delta layers
         vec![(Lsn(0x10), image1)], // image layers
         Lsn(0x50),
     )
@@ -6938,15 +7014,21 @@ mod tests {
         key
     }

-    // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
+    // We create
+    // - one bottom-most image layer,
+    // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
+    // - a delta layer D2 crossing the GC horizon with data only below the horizon,
+    // - a delta layer D3 above the horizon.
     //
-    // | D1 | | D3 |
+    //                             | D3 |
+    // | D1 |
     // -| |-- gc horizon -----------------
     // | | | D2 |
     // --------- img layer ------------------
     //
     // What we should expact from this compaction is:
-    // | Part of D1 | | D3 |
+    //                             | D3 |
+    // | Part of D1 |
     // --------- img layer with D1+D2 at GC horizon------------------

     // img layer at 0x10
@@ -6996,13 +7078,13 @@ mod tests {
     let delta3 = vec![
         (
             get_key(8),
-            Lsn(0x40),
-            Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
+            Lsn(0x48),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
         ),
         (
             get_key(9),
-            Lsn(0x40),
-            Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
+            Lsn(0x48),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
         ),
     ];

@@ -7012,7 +7094,11 @@ mod tests {
         Lsn(0x10),
         DEFAULT_PG_VERSION,
         &ctx,
-        vec![delta1, delta2, delta3], // delta layers
+        vec![
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
+        ], // delta layers
         vec![(Lsn(0x10), img_layer)], // image layers
         Lsn(0x50),
     )
@@ -7027,6 +7113,7 @@ mod tests {
             horizon: Lsn(0x30),
         },
         leases: Default::default(),
+        within_ancestor_pitr: false,
     };
     }

@@ -7039,8 +7126,8 @@ mod tests {
         Bytes::from_static(b"value 5@0x10@0x20"),
         Bytes::from_static(b"value 6@0x10@0x20"),
         Bytes::from_static(b"value 7@0x10"),
-        Bytes::from_static(b"value 8@0x10@0x40"),
-        Bytes::from_static(b"value 9@0x10@0x40"),
+        Bytes::from_static(b"value 8@0x10@0x48"),
+        Bytes::from_static(b"value 9@0x10@0x48"),
     ];

     let expected_result_at_gc_horizon = [
@@ -6,13 +6,20 @@
 //! is written as a one byte. If it's larger than that, the length
 //! is written as a four-byte integer, in big-endian, with the high
 //! bit set. This way, we can detect whether it's 1- or 4-byte header
-//! by peeking at the first byte.
+//! by peeking at the first byte. For blobs larger than 128 bits,
+//! we also specify three reserved bits, only one of the three bit
+//! patterns is currently in use (0b011) and signifies compression
+//! with zstd.
 //!
 //! len <  128: 0XXXXXXX
-//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
+//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
+use async_compression::Level;
 use bytes::{BufMut, BytesMut};
+use pageserver_api::models::ImageCompressionAlgorithm;
+use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+use tracing::warn;

 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
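The module comment above describes the blob header format: short blobs get a single length byte with the high bit clear, longer blobs get a 4-byte big-endian length whose top bits carry the "large blob" flag plus reserved compression bits. A standalone sketch of the write-side encoding; the constant names and values mirror the ones introduced later in this diff, but the function itself is illustrative only.

    const BYTE_UNCOMPRESSED: u8 = 0x80;
    const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
    const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;

    fn encode_len_header(len: usize, compressed: bool) -> Vec<u8> {
        if len < 128 {
            // Short blob: single byte, high bit clear.
            vec![len as u8]
        } else {
            assert!(len <= MAX_SUPPORTED_LEN, "blob too large");
            let mut buf = (len as u32).to_be_bytes();
            // The top nibble must be free; set the high bit plus the compression bits.
            assert_eq!(buf[0] & 0xf0, 0);
            buf[0] |= if compressed { BYTE_ZSTD } else { BYTE_UNCOMPRESSED };
            buf.to_vec()
        }
    }

    fn main() {
        assert_eq!(encode_len_header(5, false), vec![5]);
        assert_eq!(encode_len_header(300, false), vec![0x80, 0x00, 0x01, 0x2c]);
        assert_eq!(encode_len_header(300, true), vec![0x90, 0x00, 0x01, 0x2c]);
    }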
@@ -66,12 +73,37 @@ impl<'a> BlockCursor<'a> {
                 len_buf.copy_from_slice(&buf[off..off + 4]);
                 off += 4;
             }
-            len_buf[0] &= 0x7f;
+            let bit_mask = if self.read_compressed {
+                !LEN_COMPRESSION_BIT_MASK
+            } else {
+                0x7f
+            };
+            len_buf[0] &= bit_mask;
             u32::from_be_bytes(len_buf) as usize
         };
+        let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;

-        dstbuf.clear();
-        dstbuf.reserve(len);
+        let mut tmp_buf = Vec::new();
+        let buf_to_write;
+        let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed {
+            if compression_bits > BYTE_UNCOMPRESSED {
+                warn!("reading key above future limit ({len} bytes)");
+            }
+            buf_to_write = dstbuf;
+            None
+        } else if compression_bits == BYTE_ZSTD {
+            buf_to_write = &mut tmp_buf;
+            Some(dstbuf)
+        } else {
+            let error = std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                format!("invalid compression byte {compression_bits:x}"),
+            );
+            return Err(error);
+        };
+
+        buf_to_write.clear();
+        buf_to_write.reserve(len);

         // Read the payload
         let mut remain = len;
@@ -85,14 +117,35 @@ impl<'a> BlockCursor<'a> {
                 page_remain = PAGE_SZ;
             }
             let this_blk_len = min(remain, page_remain);
-            dstbuf.extend_from_slice(&buf[off..off + this_blk_len]);
+            buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]);
             remain -= this_blk_len;
             off += this_blk_len;
         }
+
+        if let Some(dstbuf) = compression {
+            if compression_bits == BYTE_ZSTD {
+                let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf);
+                decoder.write_all(buf_to_write).await?;
+                decoder.flush().await?;
+            } else {
+                unreachable!("already checked above")
+            }
+        }
+
         Ok(())
     }
 }

+/// Reserved bits for length and compression
+pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
+
+/// The maximum size of blobs we support. The highest few bits
+/// are reserved for compression and other further uses.
+const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
+
+pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
+pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
+
 /// A wrapper of `VirtualFile` that allows users to write blobs.
 ///
 /// If a `BlobWriter` is dropped, the internal buffer will be
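Mirroring the read path above, the decoder peeks at the first byte, chooses a 1- or 4-byte header, masks off the reserved bits before interpreting the length, and dispatches on the compression bits. A self-contained sketch of just that header decoding; the constants are copied from the hunk above, the function and its return shape are invented for the example.

    const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
    const BYTE_UNCOMPRESSED: u8 = 0x80;
    const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

    /// Returns (header_len, payload_len, is_zstd) for a buffer starting at a blob header.
    fn decode_len_header(buf: &[u8]) -> (usize, usize, bool) {
        let first = buf[0];
        if first < 0x80 {
            // 1-byte header: the byte itself is the length.
            return (1, first as usize, false);
        }
        let compression_bits = first & LEN_COMPRESSION_BIT_MASK;
        let mut len_buf = [buf[0], buf[1], buf[2], buf[3]];
        // Clear all reserved bits before interpreting the big-endian length.
        len_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
        let len = u32::from_be_bytes(len_buf) as usize;
        (4, len, compression_bits == BYTE_ZSTD)
    }

    fn main() {
        assert_eq!(decode_len_header(&[5]), (1, 5, false));
        assert_eq!(decode_len_header(&[0x80, 0x00, 0x01, 0x2c]), (4, 300, false));
        assert_eq!(decode_len_header(&[0x90, 0x00, 0x01, 0x2c]), (4, 300, true));
    }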
@@ -219,6 +272,18 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         &mut self,
         srcbuf: B,
         ctx: &RequestContext,
+    ) -> (B::Buf, Result<u64, Error>) {
+        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
+            .await
+    }
+
+    /// Write a blob of data. Returns the offset that it was written to,
+    /// which can be used to retrieve the data later.
+    pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        srcbuf: B,
+        ctx: &RequestContext,
+        algorithm: ImageCompressionAlgorithm,
     ) -> (B::Buf, Result<u64, Error>) {
         let offset = self.offset;

@@ -226,29 +291,60 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {

         let mut io_buf = self.io_buf.take().expect("we always put it back below");
         io_buf.clear();
-        let (io_buf, hdr_res) = async {
+        let mut compressed_buf = None;
+        let ((io_buf, hdr_res), srcbuf) = async {
             if len < 128 {
                 // Short blob. Write a 1-byte length header
                 io_buf.put_u8(len as u8);
-                self.write_all(io_buf, ctx).await
+                (
+                    self.write_all(io_buf, ctx).await,
+                    srcbuf.slice_full().into_inner(),
+                )
             } else {
                 // Write a 4-byte length header
-                if len > 0x7fff_ffff {
+                if len > MAX_SUPPORTED_LEN {
                     return (
-                        io_buf,
-                        Err(Error::new(
-                            ErrorKind::Other,
-                            format!("blob too large ({len} bytes)"),
-                        )),
+                        (
+                            io_buf,
+                            Err(Error::new(
+                                ErrorKind::Other,
+                                format!("blob too large ({len} bytes)"),
+                            )),
+                        ),
+                        srcbuf.slice_full().into_inner(),
                     );
                 }
-                if len > 0x0fff_ffff {
-                    tracing::warn!("writing blob above future limit ({len} bytes)");
-                }
-                let mut len_buf = (len as u32).to_be_bytes();
-                len_buf[0] |= 0x80;
+                let (high_bit_mask, len_written, srcbuf) = match algorithm {
+                    ImageCompressionAlgorithm::Zstd { level } => {
+                        let mut encoder = if let Some(level) = level {
+                            async_compression::tokio::write::ZstdEncoder::with_quality(
+                                Vec::new(),
+                                Level::Precise(level.into()),
+                            )
+                        } else {
+                            async_compression::tokio::write::ZstdEncoder::new(Vec::new())
+                        };
+                        let slice = srcbuf.slice_full();
+                        encoder.write_all(&slice[..]).await.unwrap();
+                        encoder.shutdown().await.unwrap();
+                        let compressed = encoder.into_inner();
+                        if compressed.len() < len {
+                            let compressed_len = compressed.len();
+                            compressed_buf = Some(compressed);
+                            (BYTE_ZSTD, compressed_len, slice.into_inner())
+                        } else {
+                            (BYTE_UNCOMPRESSED, len, slice.into_inner())
+                        }
+                    }
+                    ImageCompressionAlgorithm::Disabled => {
+                        (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
+                    }
+                };
+                let mut len_buf = (len_written as u32).to_be_bytes();
+                assert_eq!(len_buf[0] & 0xf0, 0);
+                len_buf[0] |= high_bit_mask;
                 io_buf.extend_from_slice(&len_buf[..]);
-                self.write_all(io_buf, ctx).await
+                (self.write_all(io_buf, ctx).await, srcbuf)
             }
         }
         .await;
@@ -257,7 +353,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
             Ok(_) => (),
             Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
         }
-        let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
+        let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
+            let (_buf, res) = self.write_all(compressed_buf, ctx).await;
+            (Slice::into_inner(srcbuf.slice(..)), res)
+        } else {
+            self.write_all(srcbuf, ctx).await
+        };
         (srcbuf, res.map(|_| offset))
     }
 }
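The write path above only stores the zstd output when it is actually smaller than the original payload; otherwise it falls back to the uncompressed bytes and the plain length header. Here is a minimal standalone sketch of that "compress only if it helps" decision, assuming the async-compression crate (with its tokio and zstd features) and a tokio runtime, the same dependencies the diff itself uses; maybe_compress is a name invented for the example.

    use async_compression::tokio::write::ZstdEncoder;
    use tokio::io::AsyncWriteExt;

    /// Compress `data` with zstd, but keep the original if compression does not shrink it.
    /// Returns the bytes to store plus a flag saying which variant won.
    async fn maybe_compress(data: &[u8]) -> std::io::Result<(Vec<u8>, bool)> {
        let mut encoder = ZstdEncoder::new(Vec::new());
        encoder.write_all(data).await?;
        encoder.shutdown().await?;
        let compressed = encoder.into_inner();
        if compressed.len() < data.len() {
            Ok((compressed, true))
        } else {
            Ok((data.to_vec(), false))
        }
    }

    #[tokio::main]
    async fn main() -> std::io::Result<()> {
        // Highly repetitive data compresses well; incompressible data falls back.
        let repetitive = vec![0xf3u8; 8192];
        let (stored, compressed) = maybe_compress(&repetitive).await?;
        println!("stored {} bytes, compressed = {compressed}", stored.len());
        Ok(())
    }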
@@ -289,37 +390,65 @@ impl BlobWriter<false> {
 }

 #[cfg(test)]
-mod tests {
+pub(crate) mod tests {
     use super::*;
     use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
+    use camino::Utf8PathBuf;
+    use camino_tempfile::Utf8TempDir;
     use rand::{Rng, SeedableRng};

     async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
+        round_trip_test_compressed::<BUFFERED>(blobs, false).await
+    }
+
+    pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
+        blobs: &[Vec<u8>],
+        compression: bool,
+        ctx: &RequestContext,
+    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
         let temp_dir = camino_tempfile::tempdir()?;
         let pathbuf = temp_dir.path().join("file");
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

         // Write part (in block to drop the file)
         let mut offsets = Vec::new();
         {
-            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
+            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
             let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
             for blob in blobs.iter() {
-                let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
+                let (_, res) = if compression {
+                    wtr.write_blob_maybe_compressed(
+                        blob.clone(),
+                        ctx,
+                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
+                    )
+                    .await
+                } else {
+                    wtr.write_blob(blob.clone(), ctx).await
+                };
                 let offs = res?;
                 offsets.push(offs);
             }
             // Write out one page worth of zeros so that we can
             // read again with read_blk
-            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
             let offs = res?;
             println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer(&ctx).await?;
+            wtr.flush_buffer(ctx).await?;
         }
+        Ok((temp_dir, pathbuf, offsets))
+    }

-        let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
+    async fn round_trip_test_compressed<const BUFFERED: bool>(
+        blobs: &[Vec<u8>],
+        compression: bool,
+    ) -> Result<(), Error> {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let (_temp_dir, pathbuf, offsets) =
+            write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
+
+        let file = VirtualFile::open(pathbuf, &ctx).await?;
         let rdr = BlockReaderRef::VirtualFile(&file);
-        let rdr = BlockCursor::new(rdr);
+        let rdr = BlockCursor::new_with_compression(rdr, compression);
         for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
             let blob_read = rdr.read_blob(*offset, &ctx).await?;
             assert_eq!(
@@ -330,7 +459,7 @@ mod tests {
         Ok(())
     }

-    fn random_array(len: usize) -> Vec<u8> {
+    pub(crate) fn random_array(len: usize) -> Vec<u8> {
         let mut rng = rand::thread_rng();
         (0..len).map(|_| rng.gen()).collect::<_>()
     }
@@ -353,6 +482,8 @@ mod tests {
         ];
         round_trip_test::<false>(blobs).await?;
         round_trip_test::<true>(blobs).await?;
+        round_trip_test_compressed::<false>(blobs, true).await?;
+        round_trip_test_compressed::<true>(blobs, true).await?;
         Ok(())
     }

@@ -361,10 +492,15 @@ mod tests {
         let blobs = &[
             b"test".to_vec(),
             random_array(10 * PAGE_SZ),
+            b"hello".to_vec(),
+            random_array(66 * PAGE_SZ),
+            vec![0xf3; 24 * PAGE_SZ],
             b"foobar".to_vec(),
         ];
         round_trip_test::<false>(blobs).await?;
         round_trip_test::<true>(blobs).await?;
+        round_trip_test_compressed::<false>(blobs, true).await?;
+        round_trip_test_compressed::<true>(blobs, true).await?;
         Ok(())
     }

@@ -37,6 +37,7 @@ where
|
|||||||
pub enum BlockLease<'a> {
|
pub enum BlockLease<'a> {
|
||||||
PageReadGuard(PageReadGuard<'static>),
|
PageReadGuard(PageReadGuard<'static>),
|
 EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
+Slice(&'a [u8; PAGE_SZ]),
 #[cfg(test)]
 Arc(std::sync::Arc<[u8; PAGE_SZ]>),
 #[cfg(test)]
@@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> {
 match self {
 BlockLease::PageReadGuard(v) => v.deref(),
 BlockLease::EphemeralFileMutableTail(v) => v,
+BlockLease::Slice(v) => v,
 #[cfg(test)]
 BlockLease::Arc(v) => v.deref(),
 #[cfg(test)]
@@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> {
 FileBlockReader(&'a FileBlockReader<'a>),
 EphemeralFile(&'a EphemeralFile),
 Adapter(Adapter<&'a DeltaLayerInner>),
+Slice(&'a [u8]),
 #[cfg(test)]
 TestDisk(&'a super::disk_btree::tests::TestDisk),
 #[cfg(test)]
@@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> {
 FileBlockReader(r) => r.read_blk(blknum, ctx).await,
 EphemeralFile(r) => r.read_blk(blknum, ctx).await,
 Adapter(r) => r.read_blk(blknum, ctx).await,
+Slice(s) => Self::read_blk_slice(s, blknum),
 #[cfg(test)]
 TestDisk(r) => r.read_blk(blknum),
 #[cfg(test)]
@@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> {
 }
 }
 
+impl<'a> BlockReaderRef<'a> {
+fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
+let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
+let end = start.checked_add(PAGE_SZ).unwrap();
+if end > slice.len() {
+return Err(std::io::Error::new(
+std::io::ErrorKind::UnexpectedEof,
+format!("slice too short, len={} end={}", slice.len(), end),
+));
+}
+let slice = &slice[start..end];
+let page_sized: &[u8; PAGE_SZ] = slice
+.try_into()
+.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
+Ok(BlockLease::Slice(page_sized))
+}
+}
+
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -127,16 +149,24 @@ impl<'a> BlockReaderRef<'a> {
 /// ```
 ///
 pub struct BlockCursor<'a> {
+pub(super) read_compressed: bool,
 reader: BlockReaderRef<'a>,
 }
 
 impl<'a> BlockCursor<'a> {
 pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
-BlockCursor { reader }
+Self::new_with_compression(reader, false)
+}
+pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self {
+BlockCursor {
+read_compressed,
+reader,
+}
 }
 // Needed by cli
 pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
 BlockCursor {
+read_compressed: false,
 reader: BlockReaderRef::FileBlockReader(reader),
 }
 }
@@ -166,11 +196,17 @@ pub struct FileBlockReader<'a> {
 
 /// Unique ID of this file, used as key in the page cache.
 file_id: page_cache::FileId,
+
+compressed_reads: bool,
 }
 
 impl<'a> FileBlockReader<'a> {
 pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
-FileBlockReader { file_id, file }
+FileBlockReader {
+file_id,
+file,
+compressed_reads: true,
+}
 }
 
 /// Read a page from the underlying file into given buffer.
@@ -217,7 +253,10 @@ impl<'a> FileBlockReader<'a> {
 
 impl BlockReader for FileBlockReader<'_> {
 fn block_cursor(&self) -> BlockCursor<'_> {
-BlockCursor::new(BlockReaderRef::FileBlockReader(self))
+BlockCursor::new_with_compression(
+BlockReaderRef::FileBlockReader(self),
+self.compressed_reads,
+)
 }
 }
 
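The block_io changes above thread a `read_compressed` flag through `BlockCursor` without breaking existing callers: `new` keeps its old signature and delegates to the wider `new_with_compression` constructor with `false`. A minimal standalone sketch of that delegation pattern, using hypothetical stand-in types rather than the pageserver's own:

```rust
// Standalone sketch of the constructor-delegation pattern used above.
// `Reader` and `Cursor` are hypothetical stand-ins, not the pageserver types.
struct Reader;

struct Cursor {
    read_compressed: bool,
    reader: Reader,
}

impl Cursor {
    /// Old entry point: callers that don't care about compression keep working.
    fn new(reader: Reader) -> Self {
        Self::new_with_compression(reader, false)
    }

    /// New entry point: callers that hold compressed data opt in explicitly.
    fn new_with_compression(reader: Reader, read_compressed: bool) -> Self {
        Cursor {
            read_compressed,
            reader,
        }
    }
}

fn main() {
    let legacy = Cursor::new(Reader);
    let compressed = Cursor::new_with_compression(Reader, true);
    assert!(!legacy.read_compressed);
    assert!(compressed.read_compressed);
}
```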
@@ -335,7 +335,6 @@ pub struct TenantConf
 /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
 /// to avoid eager reconnects.
 pub max_lsn_wal_lag: NonZeroU64,
-pub trace_read_requests: bool,
 pub eviction_policy: EvictionPolicy,
 pub min_resident_size_override: Option<u64>,
 // See the corresponding metric's help string.
@@ -436,10 +435,6 @@ pub struct TenantConfOpt {
 #[serde(default)]
 pub max_lsn_wal_lag: Option<NonZeroU64>,
 
-#[serde(skip_serializing_if = "Option::is_none")]
-#[serde(default)]
-pub trace_read_requests: Option<bool>,
-
 #[serde(skip_serializing_if = "Option::is_none")]
 #[serde(default)]
 pub eviction_policy: Option<EvictionPolicy>,
@@ -519,9 +514,6 @@ impl TenantConfOpt {
 .lagging_wal_timeout
 .unwrap_or(global_conf.lagging_wal_timeout),
 max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
-trace_read_requests: self
-.trace_read_requests
-.unwrap_or(global_conf.trace_read_requests),
 eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
 min_resident_size_override: self
 .min_resident_size_override
@@ -581,7 +573,6 @@ impl Default for TenantConf {
 .expect("cannot parse default walreceiver lagging wal timeout"),
 max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
 .expect("cannot parse default max walreceiver Lsn wal lag"),
-trace_read_requests: false,
 eviction_policy: EvictionPolicy::NoEviction,
 min_resident_size_override: None,
 evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
@@ -659,7 +650,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
 walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
 lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
 max_lsn_wal_lag: value.max_lsn_wal_lag,
-trace_read_requests: value.trace_read_requests,
 eviction_policy: value.eviction_policy,
 min_resident_size_override: value.min_resident_size_override,
 evictions_low_residence_duration_metric_threshold: value
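For orientation, the `TenantConfOpt` fields removed above all follow the same merge pattern: each per-tenant `Option` falls back to the global default via `unwrap_or`, which is why dropping `trace_read_requests` has to touch the struct, the merge code, the `Default` impl, and the models conversion together. A minimal sketch of that merge, with hypothetical field names:

```rust
// Minimal sketch of the Option-over-defaults merge used by TenantConfOpt.
// Field names here are illustrative; the real structs carry many more fields.
#[derive(Clone, Copy)]
struct GlobalConf {
    compaction_threshold: usize,
    gc_horizon: u64,
}

#[derive(Default)]
struct ConfOverrides {
    compaction_threshold: Option<usize>,
    gc_horizon: Option<u64>,
}

impl ConfOverrides {
    fn merge(&self, global: GlobalConf) -> GlobalConf {
        GlobalConf {
            compaction_threshold: self
                .compaction_threshold
                .unwrap_or(global.compaction_threshold),
            gc_horizon: self.gc_horizon.unwrap_or(global.gc_horizon),
        }
    }
}

fn main() {
    let global = GlobalConf {
        compaction_threshold: 10,
        gc_horizon: 64 * 1024 * 1024,
    };
    let overrides = ConfOverrides {
        gc_horizon: Some(1024),
        ..Default::default()
    };
    let effective = overrides.merge(global);
    assert_eq!(effective.compaction_threshold, 10); // falls back to global
    assert_eq!(effective.gc_horizon, 1024); // per-tenant override wins
}
```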
@@ -550,10 +550,10 @@ where
 /// We maintain the length of the stack to be always greater than zero.
 /// Two exceptions are:
 /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
 /// So because other methods cannot see the intermediate state invariant still holds.
 /// 2. `Self::finish`. It consumes self and does not return it back,
 /// which means that this is where the structure is destroyed.
 /// Thus stack of zero length cannot be observed by other methods.
 stack: Vec<BuildNode<L>>,
 
 /// Last key that was appended to the tree. Used to sanity check that append
@@ -21,6 +21,7 @@ pub struct EphemeralFile {
 }
 
 mod page_caching;
+pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
 mod zero_padded_read_write;
 
 impl EphemeralFile {
@@ -53,7 +54,7 @@ impl EphemeralFile {
 Ok(EphemeralFile {
 _tenant_shard_id: tenant_shard_id,
 _timeline_id: timeline_id,
-rw: page_caching::RW::new(file),
+rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
 })
 }
 
@@ -65,6 +66,11 @@ impl EphemeralFile {
 self.rw.page_cache_file_id()
 }
 
+/// See [`self::page_caching::RW::load_to_vec`].
+pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
+self.rw.load_to_vec(ctx).await
+}
+
 pub(crate) async fn read_blk(
 &self,
 blknum: u32,
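The `EphemeralFile` constructor above now forwards a pre-warming knob (`conf.l0_flush.prewarm_on_write()`) into the page-caching writer defined in the next file. A standalone sketch of gating a side effect on such a two-variant enum, assuming a similarly shaped type:

```rust
// Standalone sketch: gate an optional side effect on a two-variant enum,
// mirroring how PrewarmOnWrite is matched in the write path further down.
#[derive(Clone, Copy, Debug)]
enum PrewarmOnWrite {
    Yes,
    No,
}

struct Writer {
    prewarm_on_write: PrewarmOnWrite,
    prewarmed_blocks: usize,
}

impl Writer {
    fn new(prewarm_on_write: PrewarmOnWrite) -> Self {
        Writer {
            prewarm_on_write,
            prewarmed_blocks: 0,
        }
    }

    fn write_blocks(&mut self, nblocks: usize) {
        // ... write to the backing file here ...
        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
            // Only populate the (imaginary) page cache when the knob says so.
            self.prewarmed_blocks += nblocks;
        }
    }
}

fn main() {
    let mut on = Writer::new(PrewarmOnWrite::Yes);
    let mut off = Writer::new(PrewarmOnWrite::No);
    on.write_blocks(4);
    off.write_blocks(4);
    assert_eq!(on.prewarmed_blocks, 4);
    assert_eq!(off.prewarmed_blocks, 0);
}
```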
@@ -8,6 +8,7 @@ use crate::virtual_file::VirtualFile;
 
 use once_cell::sync::Lazy;
 use std::io::{self, ErrorKind};
+use std::ops::{Deref, Range};
 use tokio_epoll_uring::BoundedBuf;
 use tracing::*;
 
@@ -19,14 +20,23 @@ pub struct RW {
 rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
 }
 
+/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
+/// should we pre-warm the [`crate::page_cache`] with the contents?
+#[derive(Clone, Copy)]
+pub enum PrewarmOnWrite {
+Yes,
+No,
+}
+
 impl RW {
-pub fn new(file: VirtualFile) -> Self {
+pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
 let page_cache_file_id = page_cache::next_file_id();
 Self {
 page_cache_file_id,
 rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
 page_cache_file_id,
 file,
+prewarm_on_write,
 )),
 }
 }
@@ -49,6 +59,43 @@ impl RW {
 self.rw.bytes_written()
 }
 
+/// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
+///
+/// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
+/// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
+pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
+// round up to the next PAGE_SZ multiple, required by blob_io
+let size = {
+let s = usize::try_from(self.bytes_written()).unwrap();
+if s % PAGE_SZ == 0 {
+s
+} else {
+s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
+}
+};
+let vec = Vec::with_capacity(size);
+
+// read from disk what we've already flushed
+let writer = self.rw.as_writer();
+let flushed_range = writer.written_range();
+let mut vec = writer
+.file
+.read_exact_at(
+vec.slice(0..(flushed_range.end - flushed_range.start)),
+u64::try_from(flushed_range.start).unwrap(),
+ctx,
+)
+.await?
+.into_inner();
+
+// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
+let buffered = self.rw.get_tail_zero_padded();
+vec.extend_from_slice(buffered);
+assert_eq!(vec.len(), size);
+assert_eq!(vec.len() % PAGE_SZ, 0);
+Ok(vec)
+}
+
 pub(crate) async fn read_blk(
 &self,
 blknum: u32,
@@ -116,19 +163,40 @@ impl Drop for RW {
 }
 
 struct PreWarmingWriter {
+prewarm_on_write: PrewarmOnWrite,
 nwritten_blocks: u32,
 page_cache_file_id: page_cache::FileId,
 file: VirtualFile,
 }
 
 impl PreWarmingWriter {
-fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
+fn new(
+page_cache_file_id: page_cache::FileId,
+file: VirtualFile,
+prewarm_on_write: PrewarmOnWrite,
+) -> Self {
 Self {
+prewarm_on_write,
 nwritten_blocks: 0,
 page_cache_file_id,
 file,
 }
 }
 
+/// Return the byte range within `file` that has been written though `write_all`.
+///
+/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
+fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
+let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
+struct Wrapper(Range<usize>);
+impl Deref for Wrapper {
+type Target = Range<usize>;
+fn deref(&self) -> &Range<usize> {
+&self.0
+}
+}
+Wrapper(0..nwritten_blocks * PAGE_SZ)
+}
 }
 
 impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
@@ -178,45 +246,51 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
 assert_eq!(&check_bounds_stuff_works, &*buf);
 }
 
-// Pre-warm page cache with the contents.
-// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-// benefits the code that writes InMemoryLayer=>L0 layers.
 let nblocks = buflen / PAGE_SZ;
 let nblocks32 = u32::try_from(nblocks).unwrap();
-let cache = page_cache::get();
-static CTX: Lazy<RequestContext> = Lazy::new(|| {
-RequestContext::new(
-crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-crate::context::DownloadBehavior::Error,
-)
-});
-for blknum_in_buffer in 0..nblocks {
-let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-let blknum = self
-.nwritten_blocks
-.checked_add(blknum_in_buffer as u32)
-.unwrap();
-match cache
-.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-.await
-{
-Err(e) => {
-error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-}
-Ok(v) => match v {
-page_cache::ReadBufResult::Found(_guard) => {
-// This function takes &mut self, so, it shouldn't be possible to reach this point.
-unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
+if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
+// Pre-warm page cache with the contents.
+// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
+// benefits the code that writes InMemoryLayer=>L0 layers.
+let cache = page_cache::get();
+static CTX: Lazy<RequestContext> = Lazy::new(|| {
+RequestContext::new(
+crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
+crate::context::DownloadBehavior::Error,
+)
+});
+for blknum_in_buffer in 0..nblocks {
+let blk_in_buffer =
+&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
+let blknum = self
+.nwritten_blocks
+.checked_add(blknum_in_buffer as u32)
+.unwrap();
+match cache
+.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
+.await
+{
+Err(e) => {
+error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
+}
+Ok(v) => match v {
+page_cache::ReadBufResult::Found(_guard) => {
+// This function takes &mut self, so, it shouldn't be possible to reach this point.
+unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
 and this function takes &mut self, so, no concurrent read_blk is possible");
 }
 page_cache::ReadBufResult::NotFound(mut write_guard) => {
 write_guard.copy_from_slice(blk_in_buffer);
 let _ = write_guard.mark_valid();
 }
 },
+}
 }
 }
 
 self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
 Ok((buflen, buf.into_inner()))
 }
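`written_range` above returns an `impl Deref<Target = Range<usize>>` that captures `&self`, so the borrow checker keeps the returned range from being used across a later mutating write. A reduced sketch of the same trick with plain stand-in types:

```rust
use std::ops::{Deref, Range};

// Reduced sketch of the `written_range` trick above: return a Deref wrapper
// that captures `&self`, so the range cannot outlive the borrow of the writer.
struct Writer {
    nwritten_blocks: usize,
    block_size: usize,
}

impl Writer {
    fn written_range(&self) -> impl Deref<Target = Range<usize>> + '_ {
        struct Wrapper(Range<usize>);
        impl Deref for Wrapper {
            type Target = Range<usize>;
            fn deref(&self) -> &Range<usize> {
                &self.0
            }
        }
        Wrapper(0..self.nwritten_blocks * self.block_size)
    }

    fn write_block(&mut self) {
        self.nwritten_blocks += 1;
    }
}

fn main() {
    let mut w = Writer {
        nwritten_blocks: 3,
        block_size: 8192,
    };
    {
        let range = w.written_range();
        assert_eq!(range.end - range.start, 3 * 8192);
        // Calling `w.write_block()` here would not compile: `range` still borrows `w`.
    }
    w.write_block(); // fine once the wrapper is dropped
}
```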
@@ -75,6 +75,21 @@ where
 flushed_offset + u64::try_from(buffer.pending()).unwrap()
 }
 
+/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
+pub fn get_tail_zero_padded(&self) -> &[u8] {
+let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
+let buffer_written_up_to = buffer.pending();
+// pad to next page boundary
+let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
+buffer_written_up_to
+} else {
+buffer_written_up_to
+.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
+.unwrap()
+};
+&buffer.as_zero_padded_slice()[0..read_up_to]
+}
+
 pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
 let flushed_offset = self.buffered_writer.as_inner().bytes_written();
 let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
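Both `load_to_vec` above and `get_tail_zero_padded` here round a byte count up to the next `PAGE_SZ` boundary with checked arithmetic before zero-padding. A standalone sketch of that rounding; the `PAGE_SZ` value is an assumption here, the real constant lives in the page cache module:

```rust
// Standalone sketch of the round-up-to-page-boundary arithmetic used by
// load_to_vec / get_tail_zero_padded. PAGE_SZ = 8192 is an assumption here.
const PAGE_SZ: usize = 8192;

fn round_up_to_page(n: usize) -> usize {
    if n % PAGE_SZ == 0 {
        n
    } else {
        // checked_add mirrors the original's refusal to silently wrap on overflow
        n.checked_add(PAGE_SZ - (n % PAGE_SZ)).unwrap()
    }
}

fn main() {
    assert_eq!(round_up_to_page(0), 0);
    assert_eq!(round_up_to_page(1), PAGE_SZ);
    assert_eq!(round_up_to_page(PAGE_SZ), PAGE_SZ);
    assert_eq!(round_up_to_page(PAGE_SZ + 1), 2 * PAGE_SZ);
}
```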
@@ -43,7 +43,8 @@ use crate::tenant::config::{
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
+use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
+use crate::virtual_file::MaybeFatalIo;
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
 
 use utils::crashsafe::path_with_suffix_extension;
@@ -272,7 +273,7 @@ pub struct TenantManager {
 }
 
 fn emergency_generations(
-tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
+tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
 ) -> HashMap<TenantShardId, TenantStartupMode> {
 tenant_confs
 .iter()
@@ -296,7 +297,7 @@ fn emergency_generations(
 
 async fn init_load_generations(
 conf: &'static PageServerConf,
-tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
+tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
 resources: &TenantSharedResources,
 cancel: &CancellationToken,
 ) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
@@ -346,56 +347,32 @@ async fn init_load_generations(
 /// Given a directory discovered in the pageserver's tenants/ directory, attempt
 /// to load a tenant config from it.
 ///
-/// If file is missing, return Ok(None)
+/// If we cleaned up something expected (like an empty dir or a temp dir), return None.
 fn load_tenant_config(
 conf: &'static PageServerConf,
+tenant_shard_id: TenantShardId,
 dentry: Utf8DirEntry,
-) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
+) -> Option<Result<LocationConf, LoadConfigError>> {
 let tenant_dir_path = dentry.path().to_path_buf();
 if crate::is_temporary(&tenant_dir_path) {
 info!("Found temporary tenant directory, removing: {tenant_dir_path}");
 // No need to use safe_remove_tenant_dir_all because this is already
 // a temporary path
-if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
-error!(
-"Failed to remove temporary directory '{}': {:?}",
-tenant_dir_path, e
-);
-}
-return Ok(None);
+std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir");
+return None;
 }
 
 // This case happens if we crash during attachment before writing a config into the dir
 let is_empty = tenant_dir_path
 .is_empty_dir()
-.with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?;
+.fatal_err("Checking for empty tenant dir");
 if is_empty {
 info!("removing empty tenant directory {tenant_dir_path:?}");
-if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
-error!(
-"Failed to remove empty tenant directory '{}': {e:#}",
-tenant_dir_path
-)
-}
-return Ok(None);
+std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir");
+return None;
 }
 
-let tenant_shard_id = match tenant_dir_path
-.file_name()
-.unwrap_or_default()
-.parse::<TenantShardId>()
-{
-Ok(id) => id,
-Err(_) => {
-warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",);
-return Ok(None);
-}
-};
-
-Ok(Some((
-tenant_shard_id,
-Tenant::load_tenant_config(conf, &tenant_shard_id),
-)))
+Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
 }
 
 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -405,32 +382,51 @@ fn load_tenant_config(
 /// seconds even on reasonably fast drives.
 async fn init_load_tenant_configs(
 conf: &'static PageServerConf,
-) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
+) -> HashMap<TenantShardId, Result<LocationConf, LoadConfigError>> {
 let tenants_dir = conf.tenants_path();
 
-let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
-let dir_entries = tenants_dir
-.read_dir_utf8()
-.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
+let dentries = tokio::task::spawn_blocking(move || -> Vec<Utf8DirEntry> {
+let context = format!("read tenants dir {tenants_dir}");
+let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context);
 
-Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
+dir_entries
+.collect::<Result<Vec<_>, std::io::Error>>()
+.fatal_err(&context)
 })
-.await??;
+.await
+.expect("Config load task panicked");
 
 let mut configs = HashMap::new();
 
 let mut join_set = JoinSet::new();
 for dentry in dentries {
-join_set.spawn_blocking(move || load_tenant_config(conf, dentry));
+let tenant_shard_id = match dentry.file_name().parse::<TenantShardId>() {
+Ok(id) => id,
+Err(_) => {
+warn!(
+"Invalid tenant path (garbage in our repo directory?): '{}'",
+dentry.file_name()
+);
+continue;
+}
+};
+
+join_set.spawn_blocking(move || {
+(
+tenant_shard_id,
+load_tenant_config(conf, tenant_shard_id, dentry),
+)
+});
 }
 
 while let Some(r) = join_set.join_next().await {
-if let Some((tenant_id, tenant_config)) = r?? {
-configs.insert(tenant_id, tenant_config);
+let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task");
+if let Some(tenant_config) = tenant_config {
+configs.insert(tenant_shard_id, tenant_config);
 }
 }
 
-Ok(configs)
+configs
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -472,7 +468,7 @@ pub async fn init_tenant_mgr(
 );
 
 // Scan local filesystem for attached tenants
-let tenant_configs = init_load_tenant_configs(conf).await?;
+let tenant_configs = init_load_tenant_configs(conf).await;
 
 // Determine which tenants are to be secondary or attached, and in which generation
 let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
@@ -590,31 +586,23 @@ pub async fn init_tenant_mgr(
 );
 // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
 for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
-// Errors writing configs are fatal
-config_write_result?;
+// Writing a config to local disk is foundational to startup up tenants: panic if we can't.
+config_write_result.fatal_err("write tenant shard config file");
 
 let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
 let shard_identity = location_conf.shard;
 let slot = match location_conf.mode {
-LocationMode::Attached(attached_conf) => {
-match tenant_spawn(
-conf,
-tenant_shard_id,
-&tenant_dir_path,
-resources.clone(),
-AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-shard_identity,
-Some(init_order.clone()),
-SpawnMode::Lazy,
-&ctx,
-) {
-Ok(tenant) => TenantSlot::Attached(tenant),
-Err(e) => {
-error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
-continue;
-}
-}
-}
+LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
+conf,
+tenant_shard_id,
+&tenant_dir_path,
+resources.clone(),
+AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+shard_identity,
+Some(init_order.clone()),
+SpawnMode::Lazy,
+&ctx,
+)),
 LocationMode::Secondary(secondary_conf) => {
 info!(
 tenant_id = %tenant_shard_id.tenant_id,
@@ -649,8 +637,7 @@ pub async fn init_tenant_mgr(
 })
 }
 
-/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
-/// a broken tenant in the map if Tenant::spawn fails.
+/// Wrapper for Tenant::spawn that checks invariants before running
 #[allow(clippy::too_many_arguments)]
 fn tenant_spawn(
 conf: &'static PageServerConf,
@@ -662,23 +649,18 @@ fn tenant_spawn(
 init_order: Option<InitializationOrder>,
 mode: SpawnMode,
 ctx: &RequestContext,
-) -> anyhow::Result<Arc<Tenant>> {
-anyhow::ensure!(
-tenant_path.is_dir(),
-"Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
-);
-anyhow::ensure!(
-!crate::is_temporary(tenant_path),
-"Cannot load tenant from temporary path {tenant_path:?}"
-);
-anyhow::ensure!(
-!tenant_path.is_empty_dir().with_context(|| {
-format!("Failed to check whether {tenant_path:?} is an empty dir")
-})?,
-"Cannot load tenant from empty directory {tenant_path:?}"
-);
+) -> Arc<Tenant> {
+// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
+// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
+// to avoid impacting prod runtime performance.
+assert!(!crate::is_temporary(tenant_path));
+debug_assert!(tenant_path.is_dir());
+debug_assert!(conf
+.tenant_location_config_path(&tenant_shard_id)
+.try_exists()
+.unwrap());
 
-let tenant = Tenant::spawn(
+Tenant::spawn(
 conf,
 tenant_shard_id,
 resources,
@@ -687,9 +669,7 @@ fn tenant_spawn(
 init_order,
 mode,
 ctx,
-);
-
-Ok(tenant)
+)
 }
 
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
@@ -840,8 +820,9 @@ pub(crate) enum UpsertLocationError {
 #[error("Failed to flush: {0}")]
 Flush(anyhow::Error),
 
+/// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state.
 #[error("Internal error: {0}")]
-Other(#[from] anyhow::Error),
+InternalError(anyhow::Error),
 }
 
 impl TenantManager {
@@ -971,7 +952,8 @@ impl TenantManager {
 match fast_path_taken {
 Some(FastPathModified::Attached(tenant)) => {
 Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-.await?;
+.await
+.fatal_err("write tenant shard config");
 
 // Transition to AttachedStale means we may well hold a valid generation
 // still, and have been requested to go stale as part of a migration. If
@@ -1001,7 +983,8 @@ impl TenantManager {
 }
 Some(FastPathModified::Secondary(_secondary_tenant)) => {
 Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-.await?;
+.await
+.fatal_err("write tenant shard config");
 
 return Ok(None);
 }
@@ -1067,7 +1050,7 @@ impl TenantManager {
 Some(TenantSlot::InProgress(_)) => {
 // This should never happen: acquire_slot should error out
 // if the contents of a slot were InProgress.
-return Err(UpsertLocationError::Other(anyhow::anyhow!(
+return Err(UpsertLocationError::InternalError(anyhow::anyhow!(
 "Acquired an InProgress slot, this is a bug."
 )));
 }
@@ -1086,12 +1069,14 @@ impl TenantManager {
 // Does not need to be fsync'd because local storage is just a cache.
 tokio::fs::create_dir_all(&timelines_path)
 .await
-.with_context(|| format!("Creating {timelines_path}"))?;
+.fatal_err("create timelines/ dir");
 
 // Before activating either secondary or attached mode, persist the
 // configuration, so that on restart we will re-attach (or re-start
 // secondary) on the tenant.
-Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?;
+Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+.await
+.fatal_err("write tenant shard config");
 
 let new_slot = match &new_location_config.mode {
 LocationMode::Secondary(secondary_config) => {
@@ -1110,13 +1095,15 @@ impl TenantManager {
 // from upserts. This enables creating generation-less tenants even though neon_local
 // always uses generations when calling the location conf API.
 let attached_conf = if cfg!(feature = "testing") {
-let mut conf = AttachedTenantConf::try_from(new_location_config)?;
+let mut conf = AttachedTenantConf::try_from(new_location_config)
+.map_err(UpsertLocationError::BadRequest)?;
 if self.conf.control_plane_api.is_none() {
 conf.location.generation = Generation::none();
 }
 conf
 } else {
-AttachedTenantConf::try_from(new_location_config)?
+AttachedTenantConf::try_from(new_location_config)
+.map_err(UpsertLocationError::BadRequest)?
 };
 
 let tenant = tenant_spawn(
@@ -1129,7 +1116,7 @@ impl TenantManager {
 None,
 spawn_mode,
 ctx,
-)?;
+);
 
 TenantSlot::Attached(tenant)
 }
@@ -1143,7 +1130,7 @@ impl TenantManager {
 
 match slot_guard.upsert(new_slot) {
 Err(TenantSlotUpsertError::InternalError(e)) => {
-Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
+Err(UpsertLocationError::InternalError(anyhow::anyhow!(e)))
 }
 Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
 Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
@@ -1250,7 +1237,7 @@ impl TenantManager {
 None,
 SpawnMode::Eager,
 ctx,
-)?;
+);
 
 slot_guard.upsert(TenantSlot::Attached(tenant))?;
 
@@ -1984,7 +1971,7 @@ impl TenantManager {
 None,
 SpawnMode::Eager,
 ctx,
-)?;
+);
 
 slot_guard.upsert(TenantSlot::Attached(tenant))?;
 
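Several call sites above swap `anyhow` propagation for `.fatal_err(...)`, treating local-disk I/O failures during startup as unrecoverable. The real helper is the pageserver's `MaybeFatalIo` trait; the sketch below only illustrates the calling convention with a hypothetical trait of the same shape:

```rust
use std::fmt::Display;

// Hypothetical stand-in for the pageserver's MaybeFatalIo::fatal_err helper:
// unwrap the Ok value, or log the context and abort the process on Err.
trait FatalErr<T> {
    fn fatal_err(self, context: &str) -> T;
}

impl<T, E: Display> FatalErr<T> for Result<T, E> {
    fn fatal_err(self, context: &str) -> T {
        match self {
            Ok(v) => v,
            Err(e) => {
                eprintln!("fatal I/O error: {context}: {e}");
                std::process::exit(1);
            }
        }
    }
}

fn main() {
    // Succeeds: current_dir should exist, so this just unwraps.
    let cwd = std::env::current_dir().fatal_err("read current directory");
    println!("running in {}", cwd.display());

    // A failing call would terminate the process instead of propagating an error:
    // std::fs::read("/definitely/missing/file").fatal_err("read tenant config");
}
```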
@@ -519,7 +519,7 @@ impl RemoteTimelineClient {
 local_path: &Utf8Path,
 cancel: &CancellationToken,
 ctx: &RequestContext,
-) -> anyhow::Result<u64> {
+) -> Result<u64, DownloadError> {
 let downloaded_size = {
 let _unfinished_gauge_guard = self.metrics.call_begin(
 &RemoteOpFileKind::Layer,
@@ -23,6 +23,8 @@ use super::{
 storage_layer::LayerName,
 };
 
+use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
+use metrics::UIntGauge;
 use pageserver_api::{
 models,
 shard::{ShardIdentity, TenantShardId},
@@ -99,6 +101,17 @@ pub(crate) struct SecondaryTenant {
 
 // Public state indicating overall progress of downloads relative to the last heatmap seen
 pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
+
+// Sum of layer sizes on local disk
+pub(super) resident_size_metric: UIntGauge,
+}
+
+impl Drop for SecondaryTenant {
+fn drop(&mut self) {
+let tenant_id = self.tenant_shard_id.tenant_id.to_string();
+let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
+let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+}
 }
 
 impl SecondaryTenant {
@@ -108,6 +121,12 @@ impl SecondaryTenant {
 tenant_conf: TenantConfOpt,
 config: &SecondaryLocationConfig,
 ) -> Arc<Self> {
+let tenant_id = tenant_shard_id.tenant_id.to_string();
+let shard_id = format!("{}", tenant_shard_id.shard_slug());
+let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
+.get_metric_with_label_values(&[&tenant_id, &shard_id])
+.unwrap();
+
 Arc::new(Self {
 tenant_shard_id,
 // todo: shall we make this a descendent of the
@@ -123,6 +142,8 @@ impl SecondaryTenant {
 detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
 
 progress: std::sync::Mutex::default(),
+
+resident_size_metric,
 })
 }
 
@@ -211,16 +232,12 @@ impl SecondaryTenant {
 // have to 100% match what is on disk, because it's a best-effort warming
 // of the cache.
 let mut detail = this.detail.lock().unwrap();
-if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-let removed = timeline_detail.on_disk_layers.remove(&name);
-// We might race with removal of the same layer during downloads, if it was removed
-// from the heatmap. If we see that the OnDiskState is gone, then no need to
-// do a physical deletion or store in evicted_at.
-if let Some(removed) = removed {
-removed.remove_blocking();
-timeline_detail.evicted_at.insert(name, now);
-}
+if let Some(removed) =
+detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
+{
+// We might race with removal of the same layer during downloads, so finding the layer we
+// were trying to remove is optional. Only issue the disk I/O to remove it if we found it.
+removed.remove_blocking();
 }
 })
 .await
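The `SecondaryTenant` changes above acquire a per-shard labelled gauge at construction time and drop that label set again in `Drop`, so shards that no longer exist disappear from the exporter. A sketch of that lifecycle using the upstream `prometheus` crate directly (the pageserver goes through its own `metrics` wrapper and a u64 gauge, but the shape is the same; the metric name below is made up):

```rust
// Sketch of the labelled-gauge lifecycle above. Assumed Cargo dependency:
// prometheus = "0.13". Not the pageserver's metrics wrapper.
use prometheus::{register_int_gauge_vec, IntGauge, IntGaugeVec};

struct SecondaryShard {
    tenant_id: String,
    shard_id: String,
    resident_size: IntGauge,
}

impl SecondaryShard {
    fn new(vec: &IntGaugeVec, tenant_id: &str, shard_id: &str) -> Self {
        SecondaryShard {
            tenant_id: tenant_id.to_string(),
            shard_id: shard_id.to_string(),
            // Acquire the labelled child gauge once, at construction time.
            resident_size: vec.with_label_values(&[tenant_id, shard_id]),
        }
    }

    // Counterpart of the Drop impl above: forget the labelled child so the
    // exporter stops reporting a shard that no longer exists. (The real code
    // can do this in Drop because the gauge vec is a global static.)
    fn unregister(&self, vec: &IntGaugeVec) {
        let _ = vec.remove_label_values(&[&self.tenant_id, &self.shard_id]);
    }
}

fn main() {
    let vec = register_int_gauge_vec!(
        "secondary_resident_physical_size_sketch",
        "Bytes of secondary-tenant layers on local disk (sketch)",
        &["tenant_id", "shard_id"]
    )
    .unwrap();

    let shard = SecondaryShard::new(&vec, "tenant-a", "0001");
    shard.resident_size.add(8192);
    shard.resident_size.sub(4096);
    assert_eq!(shard.resident_size.get(), 4096);
    shard.unregister(&vec);
}
```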
@@ -46,6 +46,7 @@ use crate::tenant::{
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
+use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
@@ -131,16 +132,66 @@ impl OnDiskState {
 .or_else(fs_ext::ignore_not_found)
 .fatal_err("Deleting secondary layer")
 }
+
+pub(crate) fn file_size(&self) -> u64 {
+self.metadata.file_size
+}
 }
 
 #[derive(Debug, Clone, Default)]
 pub(super) struct SecondaryDetailTimeline {
-pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,
+on_disk_layers: HashMap<LayerName, OnDiskState>,
 
 /// We remember when layers were evicted, to prevent re-downloading them.
 pub(super) evicted_at: HashMap<LayerName, SystemTime>,
 }
 
+impl SecondaryDetailTimeline {
+pub(super) fn remove_layer(
+&mut self,
+name: &LayerName,
+resident_metric: &UIntGauge,
+) -> Option<OnDiskState> {
+let removed = self.on_disk_layers.remove(name);
+if let Some(removed) = &removed {
+resident_metric.sub(removed.file_size());
+}
+removed
+}
+
+/// `local_path`
+fn touch_layer<F>(
+&mut self,
+conf: &'static PageServerConf,
+tenant_shard_id: &TenantShardId,
+timeline_id: &TimelineId,
+touched: &HeatMapLayer,
+resident_metric: &UIntGauge,
+local_path: F,
+) where
+F: FnOnce() -> Utf8PathBuf,
+{
+use std::collections::hash_map::Entry;
+match self.on_disk_layers.entry(touched.name.clone()) {
+Entry::Occupied(mut v) => {
+v.get_mut().access_time = touched.access_time;
+}
+Entry::Vacant(e) => {
+e.insert(OnDiskState::new(
+conf,
+tenant_shard_id,
+timeline_id,
+touched.name.clone(),
+touched.metadata.clone(),
+touched.access_time,
+local_path(),
+));
+resident_metric.add(touched.metadata.file_size);
+}
+}
+}
+}
+
 // Aspects of a heatmap that we remember after downloading it
 #[derive(Clone, Debug)]
 struct DownloadSummary {
@@ -158,7 +209,7 @@ pub(super) struct SecondaryDetail {
 
 last_download: Option<DownloadSummary>,
 next_download: Option<Instant>,
-pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
+timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }
 
 /// Helper for logging SystemTime
@@ -191,6 +242,38 @@ impl SecondaryDetail {
 }
 }
 
+pub(super) fn evict_layer(
+&mut self,
+name: LayerName,
+timeline_id: &TimelineId,
+now: SystemTime,
+resident_metric: &UIntGauge,
+) -> Option<OnDiskState> {
+let timeline = self.timelines.get_mut(timeline_id)?;
+let removed = timeline.remove_layer(&name, resident_metric);
+if removed.is_some() {
+timeline.evicted_at.insert(name, now);
+}
+removed
+}
+
+pub(super) fn remove_timeline(
+&mut self,
+timeline_id: &TimelineId,
+resident_metric: &UIntGauge,
+) {
+let removed = self.timelines.remove(timeline_id);
+if let Some(removed) = removed {
+resident_metric.sub(
+removed
+.on_disk_layers
+.values()
+.map(|l| l.metadata.file_size)
+.sum(),
+);
+}
+}
+
 /// Additionally returns the total number of layers, used for more stable relative access time
 /// based eviction.
 pub(super) fn get_layers_for_eviction(
@@ -601,8 +684,13 @@ impl<'a> TenantDownloader<'a> {
 Some(t) => t,
 None => {
 // We have no existing state: need to scan local disk for layers first.
-let timeline_state =
-init_timeline_state(self.conf, tenant_shard_id, timeline).await;
+let timeline_state = init_timeline_state(
+self.conf,
+tenant_shard_id,
+timeline,
+&self.secondary_state.resident_size_metric,
+)
+.await;
 
 // Re-acquire detail lock now that we're done with async load from local FS
 self.secondary_state
@@ -671,6 +759,25 @@ impl<'a> TenantDownloader<'a> {
 .await?;
 }
 
+// Metrics consistency check in testing builds
+if cfg!(feature = "testing") {
+let detail = self.secondary_state.detail.lock().unwrap();
+let resident_size = detail
+.timelines
+.values()
+.map(|tl| {
+tl.on_disk_layers
+.values()
+.map(|v| v.metadata.file_size)
+.sum::<u64>()
+})
+.sum::<u64>();
+assert_eq!(
+resident_size,
+self.secondary_state.resident_size_metric.get()
+);
+}
+
 // Only update last_etag after a full successful download: this way will not skip
 // the next download, even if the heatmap's actual etag is unchanged.
 self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
@@ -783,7 +890,7 @@ impl<'a> TenantDownloader<'a> {
 for delete_timeline in &delete_timelines {
 // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
 // from disk fails that will be a fatal error.
-detail.timelines.remove(delete_timeline);
+detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric);
 }
 }
 
@@ -801,7 +908,7 @@ impl<'a> TenantDownloader<'a> {
 let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
 continue;
 };
-timeline_state.on_disk_layers.remove(&layer_name);
+timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
 }
 
 for timeline_id in delete_timelines {
@@ -1000,33 +1107,24 @@ impl<'a> TenantDownloader<'a> {
 let timeline_detail = detail.timelines.entry(timeline_id).or_default();
 
 tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
-for t in touched {
-use std::collections::hash_map::Entry;
-match timeline_detail.on_disk_layers.entry(t.name.clone()) {
-Entry::Occupied(mut v) => {
-v.get_mut().access_time = t.access_time;
-}
-Entry::Vacant(e) => {
-let local_path = local_layer_path(
+touched.into_iter().for_each(|t| {
+timeline_detail.touch_layer(
+self.conf,
+tenant_shard_id,
+&timeline_id,
+&t,
+&self.secondary_state.resident_size_metric,
+|| {
+local_layer_path(
 self.conf,
 tenant_shard_id,
 &timeline_id,
 &t.name,
 &t.metadata.generation,
-);
-e.insert(OnDiskState::new(
-self.conf,
-tenant_shard_id,
-&timeline_id,
-t.name,
-t.metadata.clone(),
-t.access_time,
-local_path,
-));
-}
-}
-}
+)
+},
+)
+});
 }
 
 result
@@ -1135,6 +1233,7 @@ async fn init_timeline_state(
 conf: &'static PageServerConf,
 tenant_shard_id: &TenantShardId,
 heatmap: &HeatMapTimeline,
+resident_metric: &UIntGauge,
 ) -> SecondaryDetailTimeline {
 let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
 let mut detail = SecondaryDetailTimeline::default();
@@ -1210,17 +1309,13 @@ async fn init_timeline_state(
 } else {
 // We expect the access time to be initialized immediately afterwards, when
 // the latest heatmap is applied to the state.
-detail.on_disk_layers.insert(
-name.clone(),
-OnDiskState::new(
-conf,
-tenant_shard_id,
-&heatmap.timeline_id,
-name,
-remote_meta.metadata.clone(),
-remote_meta.access_time,
-file_path,
-),
+detail.touch_layer(
+conf,
+tenant_shard_id,
+&heatmap.timeline_id,
+remote_meta,
+resident_metric,
+|| file_path,
 );
 }
 }
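The downloader changes above funnel every layer insert and removal through `touch_layer` / `remove_layer` so the resident-size gauge stays in sync, and testing builds re-derive the sum to assert it. A pure-std sketch of that accounting invariant, with a plain counter standing in for the gauge and hypothetical layer names:

```rust
use std::collections::HashMap;

// Pure-std sketch of the resident-size accounting added above: every insert
// and removal of an on-disk layer also adjusts a running counter, and a
// (test-only in the real code) check recomputes the sum to catch drift.
#[derive(Clone)]
struct OnDiskLayer {
    file_size: u64,
}

#[derive(Default)]
struct TimelineDetail {
    on_disk_layers: HashMap<String, OnDiskLayer>,
}

impl TimelineDetail {
    fn touch_layer(&mut self, name: &str, file_size: u64, resident: &mut u64) {
        // Only newly inserted layers change the counter; re-touching does not.
        self.on_disk_layers
            .entry(name.to_string())
            .or_insert_with(|| {
                *resident += file_size;
                OnDiskLayer { file_size }
            });
    }

    fn remove_layer(&mut self, name: &str, resident: &mut u64) {
        if let Some(removed) = self.on_disk_layers.remove(name) {
            *resident -= removed.file_size;
        }
    }

    fn recomputed_size(&self) -> u64 {
        self.on_disk_layers.values().map(|l| l.file_size).sum()
    }
}

fn main() {
    let mut detail = TimelineDetail::default();
    let mut resident: u64 = 0;

    detail.touch_layer("layer-a", 8192, &mut resident);
    detail.touch_layer("layer-b", 16384, &mut resident);
    detail.touch_layer("layer-b", 16384, &mut resident); // re-touch: no double count
    detail.remove_layer("layer-a", &mut resident);

    // The invariant the testing-build assertion checks: counter == recomputed sum.
    assert_eq!(resident, detail.recomputed_size());
    assert_eq!(resident, 16384);
}
```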
@@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
+use tenant_size_model::svg::SvgBranchKind;
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -87,6 +88,9 @@ impl SegmentMeta {
 LsnKind::BranchPoint => true,
 LsnKind::GcCutOff => true,
 LsnKind::BranchEnd => false,
+LsnKind::LeasePoint => true,
+LsnKind::LeaseStart => false,
+LsnKind::LeaseEnd => false,
 }
 }
 }
@@ -103,6 +107,21 @@ pub enum LsnKind {
 GcCutOff,
 /// Last record LSN
 BranchEnd,
+/// A LSN lease is granted here.
+LeasePoint,
+/// A lease starts from here.
+LeaseStart,
+/// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]).
+LeaseEnd,
+}
+
+impl From<LsnKind> for SvgBranchKind {
+fn from(kind: LsnKind) -> Self {
+match kind {
+LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease,
+_ => SvgBranchKind::Timeline,
+}
+}
 }
 
 /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
@@ -124,6 +143,9 @@ pub struct TimelineInputs {
 
 /// Cutoff point calculated from the user-supplied 'max_retention_period'
 retention_param_cutoff: Option<Lsn>,
+
+/// Lease points on the timeline
+lease_points: Vec<Lsn>,
 }
 
 /// Gathers the inputs for the tenant sizing model.
@@ -234,6 +256,13 @@ pub(super) async fn gather_inputs(
 None
 };
 
+let lease_points = gc_info
+.leases
+.keys()
+.filter(|&&lsn| lsn > ancestor_lsn)
+.copied()
+.collect::<Vec<_>>();
+
 // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
 // want to query any logical size before initdb_lsn.
 let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -248,6 +277,8 @@ pub(super) async fn gather_inputs(
 .map(|lsn| (lsn, LsnKind::BranchPoint))
 .collect::<Vec<_>>();
 
+lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+
 drop(gc_info);
 
 // Add branch points we collected earlier, just in case there were any that were
@@ -296,6 +327,7 @@ pub(super) async fn gather_inputs(
 if kind == LsnKind::BranchPoint {
 branchpoint_segments.insert((timeline_id, lsn), segments.len());
 }
+
 segments.push(SegmentMeta {
 segment: Segment {
 parent: Some(parent),
@@ -306,7 +338,45 @@ pub(super) async fn gather_inputs(
 timeline_id: timeline.timeline_id,
 kind,
 });
-parent += 1;
+
+parent = segments.len() - 1;
+
+if kind == LsnKind::LeasePoint {
+// Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data
+// (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN
+// value. Without the other two segments, the calculation code would not count the leased LSN as a point
+// to be retained.
+// Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug.
+//
+// Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and
+// branch points can be given a synthetic id so we can unite them.
+let mut lease_parent = parent;
+
+// Start of a lease.
+segments.push(SegmentMeta {
+segment: Segment {
+parent: Some(lease_parent),
+lsn: lsn.0,
+size: None, // Filled in later, if necessary
|
||||||
|
needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
|
||||||
|
},
|
||||||
|
timeline_id: timeline.timeline_id,
|
||||||
|
kind: LsnKind::LeaseStart,
|
||||||
|
});
|
||||||
|
lease_parent += 1;
|
||||||
|
|
||||||
|
// End of the lease.
|
||||||
|
segments.push(SegmentMeta {
|
||||||
|
segment: Segment {
|
||||||
|
parent: Some(lease_parent),
|
||||||
|
lsn: lsn.0,
|
||||||
|
size: None, // Filled in later, if necessary
|
||||||
|
needed: true, // everything at the lease LSN must be readable => is needed
|
||||||
|
},
|
||||||
|
timeline_id: timeline.timeline_id,
|
||||||
|
kind: LsnKind::LeaseEnd,
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Current end of the timeline
|
// Current end of the timeline
|
||||||
@@ -332,6 +402,7 @@ pub(super) async fn gather_inputs(
|
|||||||
pitr_cutoff,
|
pitr_cutoff,
|
||||||
next_gc_cutoff,
|
next_gc_cutoff,
|
||||||
retention_param_cutoff,
|
retention_param_cutoff,
|
||||||
|
lease_points,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -674,7 +745,8 @@ fn verify_size_for_multiple_branches() {
|
|||||||
"horizon_cutoff": "0/2210CD0",
|
"horizon_cutoff": "0/2210CD0",
|
||||||
"pitr_cutoff": "0/2210CD0",
|
"pitr_cutoff": "0/2210CD0",
|
||||||
"next_gc_cutoff": "0/2210CD0",
|
"next_gc_cutoff": "0/2210CD0",
|
||||||
"retention_param_cutoff": null
|
"retention_param_cutoff": null,
|
||||||
|
"lease_points": []
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"timeline_id": "454626700469f0a9914949b9d018e876",
|
"timeline_id": "454626700469f0a9914949b9d018e876",
|
||||||
@@ -684,7 +756,8 @@ fn verify_size_for_multiple_branches() {
|
|||||||
"horizon_cutoff": "0/1817770",
|
"horizon_cutoff": "0/1817770",
|
||||||
"pitr_cutoff": "0/1817770",
|
"pitr_cutoff": "0/1817770",
|
||||||
"next_gc_cutoff": "0/1817770",
|
"next_gc_cutoff": "0/1817770",
|
||||||
"retention_param_cutoff": null
|
"retention_param_cutoff": null,
|
||||||
|
"lease_points": []
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
|
"timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
|
||||||
@@ -694,7 +767,8 @@ fn verify_size_for_multiple_branches() {
|
|||||||
"horizon_cutoff": "0/18B3D98",
|
"horizon_cutoff": "0/18B3D98",
|
||||||
"pitr_cutoff": "0/18B3D98",
|
"pitr_cutoff": "0/18B3D98",
|
||||||
"next_gc_cutoff": "0/18B3D98",
|
"next_gc_cutoff": "0/18B3D98",
|
||||||
"retention_param_cutoff": null
|
"retention_param_cutoff": null,
|
||||||
|
"lease_points": []
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -749,7 +823,8 @@ fn verify_size_for_one_branch() {
|
|||||||
"horizon_cutoff": "47/240A5860",
|
"horizon_cutoff": "47/240A5860",
|
||||||
"pitr_cutoff": "47/240A5860",
|
"pitr_cutoff": "47/240A5860",
|
||||||
"next_gc_cutoff": "47/240A5860",
|
"next_gc_cutoff": "47/240A5860",
|
||||||
"retention_param_cutoff": "0/0"
|
"retention_param_cutoff": "0/0",
|
||||||
|
"lease_points": []
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}"#;
|
}"#;
|
||||||
|
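The lease segments added in the hunk above model a lease as a read-only branch of zero length: three segments share one LSN, and only the lease end is unconditionally `needed`. Below is a minimal, self-contained sketch of that shape; `Seg`, `push_lease`, and its field names are illustrative stand-ins for the real `Segment`/`SegmentMeta` types, not the crate's API.

```rust
// Illustrative only: a flattened version of the lease-as-zero-length-branch modeling.
#[derive(Debug)]
struct Seg {
    parent: Option<usize>,
    lsn: u64,
    needed: bool,
    kind: &'static str,
}

/// Push the start/end pair that turns a lease granted at `lsn` into a branch that never
/// advances past its branch point, so the sizing model retains everything readable at `lsn`.
fn push_lease(segments: &mut Vec<Seg>, lease_point: usize, lsn: u64, next_gc_cutoff: u64) {
    segments.push(Seg { parent: Some(lease_point), lsn, needed: lsn > next_gc_cutoff, kind: "LeaseStart" });
    let lease_start = segments.len() - 1;
    segments.push(Seg { parent: Some(lease_start), lsn, needed: true, kind: "LeaseEnd" });
}

fn main() {
    // A branch start, a lease point at 0x40, and a GC cutoff at 0x30.
    let mut segments = vec![Seg { parent: None, lsn: 0x10, needed: false, kind: "BranchStart" }];
    segments.push(Seg { parent: Some(0), lsn: 0x40, needed: true, kind: "LeasePoint" });
    let lease_point = segments.len() - 1;
    push_lease(&mut segments, lease_point, 0x40, 0x30);
    for (i, s) in segments.iter().enumerate() {
        println!("{i}: {s:?}");
    }
}
```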
@@ -7,6 +7,9 @@ pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;

+#[cfg(test)]
+pub mod merge_iterator;
+
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::task_mgr::TaskKind;
@@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
-use pageserver_api::models::LayerAccessKind;
+use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};

@@ -223,6 +223,11 @@ pub struct DeltaLayerInner {
     file: VirtualFile,
     file_id: FileId,

+    #[allow(dead_code)]
+    layer_key_range: Range<Key>,
+    #[allow(dead_code)]
+    layer_lsn_range: Range<Lsn>,
+
     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

@@ -452,7 +457,12 @@ impl DeltaLayerWriterInner {
         ctx: &RequestContext,
     ) -> (Vec<u8>, anyhow::Result<()>) {
         assert!(self.lsn_range.start <= lsn);
-        let (val, res) = self.blob_writer.write_blob(val, ctx).await;
+        // We don't want to use compression in delta layer creation
+        let compression = ImageCompressionAlgorithm::Disabled;
+        let (val, res) = self
+            .blob_writer
+            .write_blob_maybe_compressed(val, ctx, compression)
+            .await;
         let off = match res {
             Ok(off) => off,
             Err(e) => return (val, Err(anyhow::anyhow!(e))),

@@ -737,6 +747,16 @@ impl DeltaLayer {
 }

 impl DeltaLayerInner {
+    #[cfg(test)]
+    pub(crate) fn key_range(&self) -> &Range<Key> {
+        &self.layer_key_range
+    }
+
+    #[cfg(test)]
+    pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
+        &self.layer_lsn_range
+    }
+
     /// Returns nested result following Result<Result<_, OpErr>, Critical>:
     /// - inner has the success or transient failure
     /// - outer has the permanent failure

@@ -785,6 +805,8 @@ impl DeltaLayerInner {
             index_start_blk: actual_summary.index_start_blk,
             index_root_blk: actual_summary.index_root_blk,
             max_vectored_read_bytes,
+            layer_key_range: actual_summary.key_range,
+            layer_lsn_range: actual_summary.lsn_range,
         }))
     }

@@ -1299,7 +1321,7 @@ impl DeltaLayerInner {
                 offsets.start.pos(),
                 offsets.end.pos(),
                 meta,
-                Some(max_read_size),
+                max_read_size,
             ))
         }
     } else {

@@ -1593,13 +1615,17 @@ impl<'a> DeltaLayerIterator<'a> {
                 let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
                 let blob_ref = BlobRef(value);
                 let offset = blob_ref.pos();
-                if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
+                if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
                     break batch_plan;
                 }
             } else {
                 self.is_end = true;
                 let data_end_offset = self.delta_layer.index_start_offset();
-                break self.planner.handle_range_end(data_end_offset);
+                if let Some(item) = self.planner.handle_range_end(data_end_offset) {
+                    break item;
+                } else {
+                    return Ok(()); // TODO: test empty iterator
+                }
             }
         };
         let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);

@@ -1634,7 +1660,7 @@ impl<'a> DeltaLayerIterator<'a> {
 }

 #[cfg(test)]
-mod test {
+pub(crate) mod test {
     use std::collections::BTreeMap;

     use itertools::MinMaxResult;

@@ -2212,13 +2238,20 @@ mod test {
         }
     }

-    async fn produce_delta_layer(
+    pub(crate) fn sort_delta(
+        (k1, l1, _): &(Key, Lsn, Value),
+        (k2, l2, _): &(Key, Lsn, Value),
+    ) -> std::cmp::Ordering {
+        (k1, l1).cmp(&(k2, l2))
+    }
+
+    pub(crate) async fn produce_delta_layer(
         tenant: &Tenant,
         tline: &Arc<Timeline>,
         mut deltas: Vec<(Key, Lsn, Value)>,
         ctx: &RequestContext,
     ) -> anyhow::Result<ResidentLayer> {
-        deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2)));
+        deltas.sort_by(sort_delta);
         let (key_start, _, _) = deltas.first().unwrap();
         let (key_max, _, _) = deltas.first().unwrap();
         let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
@@ -369,6 +369,16 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
+    #[cfg(test)]
+    pub(crate) fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+
+    #[cfg(test)]
+    pub(crate) fn lsn(&self) -> Lsn {
+        self.lsn
+    }
+
     /// Returns nested result following Result<Result<_, OpErr>, Critical>:
     /// - inner has the success or transient failure
     /// - outer has the permanent failure

@@ -799,7 +809,11 @@ impl ImageLayerWriterInner {
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         ensure!(self.key_range.contains(&key));
-        let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
+        let compression = self.conf.image_compression;
+        let (_img, res) = self
+            .blob_writer
+            .write_blob_maybe_compressed(img, ctx, compression)
+            .await;
         // TODO: re-use the buffer for `img` further upstack
         let off = res?;

@@ -984,14 +998,17 @@ impl<'a> ImageLayerIterator<'a> {
                     Key::from_slice(&raw_key[..KEY_SIZE]),
                     self.image_layer.lsn,
                     offset,
-                    BlobFlag::None,
                 ) {
                     break batch_plan;
                 }
             } else {
                 self.is_end = true;
                 let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
-                break self.planner.handle_range_end(payload_end);
+                if let Some(item) = self.planner.handle_range_end(payload_end) {
+                    break item;
+                } else {
+                    return Ok(()); // TODO: a test case on empty iterator
+                }
             }
         };
         let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
@@ -6,13 +6,14 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
+use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
-use crate::tenant::block_io::BlockReader;
+use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{PageReconstructError, Timeline};
-use crate::{page_cache, walrecord};
+use crate::{l0_flush, page_cache, walrecord};
 use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;

@@ -410,6 +411,7 @@ impl InMemoryLayer {
                 continue;
             }

+            // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
             let buf = reader.read_blob(block_read.block_offset, &ctx).await;
             if let Err(e) = buf {
                 reconstruct_state

@@ -620,6 +622,13 @@ impl InMemoryLayer {
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().await;

+       let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
+       use l0_flush::Inner;
+       let _concurrency_permit = match &*l0_flush_global_state {
+           Inner::PageCached => None,
+           Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
+       };
+
        let end_lsn = *self.end_lsn.get().unwrap();

        let key_count = if let Some(key_range) = key_range {

@@ -645,28 +654,83 @@ impl InMemoryLayer {
             )
             .await?;

-        let mut buf = Vec::new();
-
-        let cursor = inner.file.block_cursor();
-
-        let ctx = RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::InMemoryLayer)
-            .build();
-        for (key, vec_map) in inner.index.iter() {
-            // Write all page versions
-            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
-                let will_init = Value::des(&buf)?.will_init();
-                let res;
-                (buf, res) = delta_layer_writer
-                    .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
-                    .await;
-                res?;
+        match &*l0_flush_global_state {
+            l0_flush::Inner::PageCached => {
+                let ctx = RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::InMemoryLayer)
+                    .build();
+
+                let mut buf = Vec::new();
+
+                let cursor = inner.file.block_cursor();
+
+                for (key, vec_map) in inner.index.iter() {
+                    // Write all page versions
+                    for (lsn, pos) in vec_map.as_slice() {
+                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
+                        let will_init = Value::des(&buf)?.will_init();
+                        let res;
+                        (buf, res) = delta_layer_writer
+                            .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
+                            .await;
+                        res?;
+                    }
+                }
+            }
+            l0_flush::Inner::Direct { .. } => {
+                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
+                assert_eq!(
+                    file_contents.len() % PAGE_SZ,
+                    0,
+                    "needed by BlockReaderRef::Slice"
+                );
+                assert_eq!(file_contents.len(), {
+                    let written = usize::try_from(inner.file.len()).unwrap();
+                    if written % PAGE_SZ == 0 {
+                        written
+                    } else {
+                        written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
+                    }
+                });
+
+                let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
+
+                let mut buf = Vec::new();
+
+                for (key, vec_map) in inner.index.iter() {
+                    // Write all page versions
+                    for (lsn, pos) in vec_map.as_slice() {
+                        // TODO: once we have blob lengths in the in-memory index, we can
+                        // 1. get rid of the blob_io / BlockReaderRef::Slice business and
+                        // 2. load the file contents into a Bytes and
+                        // 3. the use `Bytes::slice` to get the `buf` that is our blob
+                        // 4. pass that `buf` into `put_value_bytes`
+                        // => https://github.com/neondatabase/neon/issues/8183
+                        cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
+                        let will_init = Value::des(&buf)?.will_init();
+                        let res;
+                        (buf, res) = delta_layer_writer
+                            .put_value_bytes(*key, *lsn, buf, will_init, ctx)
+                            .await;
+                        res?;
+                    }
+                }
             }
         }

         // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
+
+        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
+        //
+        // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
+        // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
+        // Thus, we'd have more concurrenct `Vec<u8>` in existence than the semaphore allows.
+        //
+        // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
+        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
+        drop(_concurrency_permit);
+
         Ok(Some(delta_layer))
     }
 }
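The `Direct` flush path above takes a semaphore permit before loading the whole in-memory file into a `Vec<u8>` and keeps holding it until the final fsync. Here is a standalone sketch of that pattern under assumed names; `flush_with_permit` and its parameters are illustrative, not part of the pageserver API.

```rust
// Sketch: bound the number of large in-flight buffers with a shared semaphore, and
// release the permit only after the write *and* the fsync have completed.
use std::sync::Arc;
use tokio::io::AsyncWriteExt;
use tokio::sync::Semaphore;

async fn flush_with_permit(
    limiter: Arc<Semaphore>,
    payload: Vec<u8>,
    mut out: tokio::fs::File,
) -> std::io::Result<()> {
    // Acquire before allocating/using the big buffer counts against the limit.
    let permit = limiter.acquire_owned().await.expect("semaphore closed");

    out.write_all(&payload).await?;
    out.sync_all().await?; // fsync while still holding the permit

    // Dropping the permit earlier would allow more large buffers to exist
    // concurrently than the semaphore is meant to permit.
    drop(permit);
    Ok(())
}
```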
@@ -1096,19 +1096,10 @@ impl LayerInner {

         match rx.await {
             Ok(Ok(res)) => Ok(res),
-            Ok(Err(e)) => {
-                // sleep already happened in the spawned task, if it was not cancelled
-                match e.downcast_ref::<remote_storage::DownloadError>() {
-                    // If the download failed due to its cancellation token,
-                    // propagate the cancellation error upstream.
-                    Some(remote_storage::DownloadError::Cancelled) => {
-                        Err(DownloadError::DownloadCancelled)
-                    }
-                    // FIXME: this is not embedding the error because historically it would had
-                    // been output to compute, however that is no longer the case.
-                    _ => Err(DownloadError::DownloadFailed),
-                }
+            Ok(Err(remote_storage::DownloadError::Cancelled)) => {
+                Err(DownloadError::DownloadCancelled)
             }
+            Ok(Err(_)) => Err(DownloadError::DownloadFailed),
             Err(_gone) => Err(DownloadError::DownloadCancelled),
         }
     }

@@ -1118,7 +1109,7 @@ impl LayerInner {
         timeline: Arc<Timeline>,
         permit: heavier_once_cell::InitPermit,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<DownloadedLayer>> {
+    ) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
         let result = timeline
             .remote_client
             .download_layer_file(

@@ -25,7 +25,7 @@ pub struct PersistentLayerDesc {
     ///
     /// - For an open in-memory layer, the end bound is MAX_LSN
     /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
     ///   range start
     /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
     pub lsn_range: Range<Lsn>,
     /// Whether this is a delta layer, and also, is this incremental.
pageserver/src/tenant/storage_layer/merge_iterator.rs (new file, 412 lines)
@@ -0,0 +1,412 @@
use std::{
    cmp::Ordering,
    collections::{binary_heap, BinaryHeap},
};

use pageserver_api::key::Key;
use utils::lsn::Lsn;

use crate::{context::RequestContext, repository::Value};

use super::{
    delta_layer::{DeltaLayerInner, DeltaLayerIterator},
    image_layer::{ImageLayerInner, ImageLayerIterator},
};

#[derive(Clone, Copy)]
enum LayerRef<'a> {
    Image(&'a ImageLayerInner),
    Delta(&'a DeltaLayerInner),
}

impl<'a> LayerRef<'a> {
    fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> {
        match self {
            Self::Image(x) => LayerIterRef::Image(x.iter(ctx)),
            Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
        }
    }
}

enum LayerIterRef<'a> {
    Image(ImageLayerIterator<'a>),
    Delta(DeltaLayerIterator<'a>),
}

impl LayerIterRef<'_> {
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        match self {
            Self::Delta(x) => x.next().await,
            Self::Image(x) => x.next().await,
        }
    }
}

/// This type plays several roles at once
/// 1. Unified iterator for image and delta layers.
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
/// 3. Lazy creation of the real delta/image iterator.
enum IteratorWrapper<'a> {
    NotLoaded {
        ctx: &'a RequestContext,
        first_key_lower_bound: (Key, Lsn),
        layer: LayerRef<'a>,
    },
    Loaded {
        iter: PeekableLayerIterRef<'a>,
    },
}

struct PeekableLayerIterRef<'a> {
    iter: LayerIterRef<'a>,
    peeked: Option<(Key, Lsn, Value)>, // None == end
}

impl<'a> PeekableLayerIterRef<'a> {
    async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result<Self> {
        let peeked = iter.next().await?;
        Ok(Self { iter, peeked })
    }

    fn peek(&self) -> &Option<(Key, Lsn, Value)> {
        &self.peeked
    }

    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        let result = self.peeked.take();
        self.peeked = self.iter.next().await?;
        Ok(result)
    }
}

impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl<'a> std::cmp::Eq for IteratorWrapper<'a> {}

impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        let a = self.peek_next_key_lsn();
        let b = other.peek_next_key_lsn();
        match (a, b) {
            (Some((k1, l1)), Some((k2, l2))) => {
                let loaded_1 = if self.is_loaded() { 1 } else { 0 };
                let loaded_2 = if other.is_loaded() { 1 } else { 0 };
                // When key_lsn are the same, the unloaded iter will always appear before the loaded one.
                // And note that we do a reverse at the end of the comparison, so it works with the max heap.
                (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2))
            }
            (Some(_), None) => Ordering::Less,
            (None, Some(_)) => Ordering::Greater,
            (None, None) => Ordering::Equal,
        }
        .reverse()
    }
}

impl<'a> IteratorWrapper<'a> {
    pub fn create_from_image_layer(
        image_layer: &'a ImageLayerInner,
        ctx: &'a RequestContext,
    ) -> Self {
        Self::NotLoaded {
            layer: LayerRef::Image(image_layer),
            first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
            ctx,
        }
    }

    pub fn create_from_delta_layer(
        delta_layer: &'a DeltaLayerInner,
        ctx: &'a RequestContext,
    ) -> Self {
        Self::NotLoaded {
            layer: LayerRef::Delta(delta_layer),
            first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
            ctx,
        }
    }

    fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> {
        match self {
            Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)),
            Self::NotLoaded {
                first_key_lower_bound: (key, lsn),
                ..
            } => Some((key, *lsn)),
        }
    }

    // CORRECTNESS: this function must always take `&mut self`, never `&self`.
    //
    // The reason is that `impl Ord for Self` evaluates differently after this function
    // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when
    // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut`
    // and not just `PeekMut::deref`
    // If we don't take `&mut self`
    async fn load(&mut self) -> anyhow::Result<()> {
        assert!(!self.is_loaded());
        let Self::NotLoaded {
            ctx,
            first_key_lower_bound,
            layer,
        } = self
        else {
            unreachable!()
        };
        let iter = layer.iter(ctx);
        let iter = PeekableLayerIterRef::create(iter).await?;
        if let Some((k1, l1, _)) = iter.peek() {
            let (k2, l2) = first_key_lower_bound;
            debug_assert!((k1, l1) >= (k2, l2));
        }
        *self = Self::Loaded { iter };
        Ok(())
    }

    fn is_loaded(&self) -> bool {
        matches!(self, Self::Loaded { .. })
    }

    /// Correctness: must load the iterator before using.
    ///
    /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it.
    /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
    /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        let Self::Loaded { iter } = self else {
            panic!("must load the iterator before using")
        };
        iter.next().await
    }
}

pub struct MergeIterator<'a> {
    heap: BinaryHeap<IteratorWrapper<'a>>,
}

impl<'a> MergeIterator<'a> {
    pub fn create(
        deltas: &[&'a DeltaLayerInner],
        images: &[&'a ImageLayerInner],
        ctx: &'a RequestContext,
    ) -> Self {
        let mut heap = Vec::with_capacity(images.len() + deltas.len());
        for image in images {
            heap.push(IteratorWrapper::create_from_image_layer(image, ctx));
        }
        for delta in deltas {
            heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx));
        }
        Self {
            heap: BinaryHeap::from(heap),
        }
    }

    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        while let Some(mut iter) = self.heap.peek_mut() {
            if !iter.is_loaded() {
                // Once we load the iterator, we can know the real first key-value pair in the iterator.
                // We put it back into the heap so that a potentially unloaded layer may have a key between
                // [potential_first_key, loaded_first_key).
                iter.load().await?;
                continue;
            }
            let Some(item) = iter.next().await? else {
                // If the iterator returns None, we pop this iterator. Actually, in the current implementation,
                // we order None > Some, and all the rest of the iterators should return None.
                binary_heap::PeekMut::pop(iter);
                continue;
            };
            return Ok(Some(item));
        }
        Ok(None)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use itertools::Itertools;
    use pageserver_api::key::Key;
    use utils::lsn::Lsn;

    use crate::{
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
            storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
        },
        DEFAULT_PG_VERSION,
    };

    async fn assert_merge_iter_equal(
        merge_iter: &mut MergeIterator<'_>,
        expect: &[(Key, Lsn, Value)],
    ) {
        let mut expect_iter = expect.iter();
        loop {
            let o1 = merge_iter.next().await.unwrap();
            let o2 = expect_iter.next();
            assert_eq!(o1.is_some(), o2.is_some());
            if o1.is_none() && o2.is_none() {
                break;
            }
            let (k1, l1, v1) = o1.unwrap();
            let (k2, l2, v2) = o2.unwrap();
            assert_eq!(&k1, k2);
            assert_eq!(l1, *l2);
            assert_eq!(&v1, v2);
        }
    }

    #[tokio::test]
    async fn merge_in_between() {
        use crate::repository::Value;
        use bytes::Bytes;

        let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }
        let test_deltas1 = vec![
            (
                get_key(0),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
            (
                get_key(5),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
        ];
        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
            .await
            .unwrap();
        let test_deltas2 = vec![
            (
                get_key(3),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
            (
                get_key(4),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
        ];
        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
            .await
            .unwrap();
        let mut merge_iter = MergeIterator::create(
            &[
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
            ],
            &[],
            &ctx,
        );
        let mut expect = Vec::new();
        expect.extend(test_deltas1);
        expect.extend(test_deltas2);
        expect.sort_by(sort_delta);
        assert_merge_iter_equal(&mut merge_iter, &expect).await;
    }

    #[tokio::test]
    async fn delta_merge() {
        use crate::repository::Value;
        use bytes::Bytes;

        let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }
        const N: usize = 1000;
        let test_deltas1 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10),
                    Lsn(0x20 * ((idx as u64) % 10 + 1)),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
            .await
            .unwrap();
        let test_deltas2 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10),
                    Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
            .await
            .unwrap();
        let test_deltas3 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10 + N as u32),
                    Lsn(0x10 * ((idx as u64) % 10 + 1)),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
            .await
            .unwrap();
        let mut merge_iter = MergeIterator::create(
            &[
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
            ],
            &[],
            &ctx,
        );
        let mut expect = Vec::new();
        expect.extend(test_deltas1);
        expect.extend(test_deltas2);
        expect.extend(test_deltas3);
        expect.sort_by(sort_delta);
        assert_merge_iter_equal(&mut merge_iter, &expect).await;

        // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
    }

    // TODO: image layer merge, delta+image mixed merge
    // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that
}
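The new `MergeIterator` is a k-way merge: every layer contributes a stream already sorted by `(key, lsn)`, and a binary heap always yields the globally smallest head. Below is a dependency-free sketch of the same idea, without the lazy loading and async machinery of the real iterator; all names are illustrative.

```rust
// k-way merge over pre-sorted inputs using a min-heap of "peeked heads".
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn k_merge(inputs: Vec<Vec<(u64, u64, &'static str)>>) -> Vec<(u64, u64, &'static str)> {
    // Heap entries: Reverse((key, lsn, input_idx, pos)) so the smallest (key, lsn) pops first.
    let mut heap = BinaryHeap::new();
    for (idx, input) in inputs.iter().enumerate() {
        if let Some(&(k, l, _)) = input.first() {
            heap.push(Reverse((k, l, idx, 0usize)));
        }
    }

    let mut out = Vec::new();
    while let Some(Reverse((_, _, idx, pos))) = heap.pop() {
        out.push(inputs[idx][pos]);
        // Re-insert the next head of the input we just consumed from, if any.
        if let Some(&(k, l, _)) = inputs[idx].get(pos + 1) {
            heap.push(Reverse((k, l, idx, pos + 1)));
        }
    }
    out
}

fn main() {
    let a = vec![(0, 0x10, "a0"), (5, 0x10, "a5")];
    let b = vec![(3, 0x10, "b3"), (4, 0x10, "b4")];
    let merged = k_merge(vec![a, b]);
    // The output stays sorted by (key, lsn) across all inputs.
    assert!(merged.windows(2).all(|w| (w[0].0, w[0].1) <= (w[1].0, w[1].1)));
    println!("{merged:?}");
}
```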
@@ -14,6 +14,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use arc_swap::ArcSwap;
 use bytes::Bytes;
 use camino::Utf8Path;
+use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
 use once_cell::sync::Lazy;

@@ -65,13 +66,12 @@ use std::{
     ops::{Deref, Range},
 };

-use crate::metrics::GetKind;
-use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
     aux_file::AuxFileSizeEstimator,
     tenant::{
         layer_map::{LayerMap, SearchResult},
         metadata::TimelineMetadata,
+        storage_layer::PersistentLayerDesc,
     },
 };
 use crate::{

@@ -90,10 +90,15 @@ use crate::{
     disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
 };
+use crate::{
+    l0_flush::{self, L0FlushGlobalState},
+    metrics::GetKind,
+};
 use crate::{
     metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
+use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
 use crate::{
     pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
     virtual_file::{MaybeFatalIo, VirtualFile},

@@ -208,6 +213,7 @@ pub struct TimelineResources {
     pub timeline_get_throttle: Arc<
         crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
     >,
+    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }

 pub(crate) struct AuxFilesState {

@@ -360,6 +366,7 @@ pub struct Timeline {
     repartition_threshold: u64,

     last_image_layer_creation_check_at: AtomicLsn,
+    last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,

     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,

@@ -433,6 +440,8 @@ pub struct Timeline {
     /// in the future, add `extra_test_sparse_keyspace` if necessary.
     #[cfg(test)]
     pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
+
+    pub(crate) l0_flush_global_state: L0FlushGlobalState,
 }

 pub struct WalReceiverInfo {

@@ -457,6 +466,9 @@ pub(crate) struct GcInfo {

     /// Leases granted to particular LSNs.
     pub(crate) leases: BTreeMap<Lsn, LsnLease>,
+
+    /// Whether our branch point is within our ancestor's PITR interval (for cost estimation)
+    pub(crate) within_ancestor_pitr: bool,
 }

 impl GcInfo {

@@ -717,6 +729,9 @@ impl From<CreateImageLayersError> for CompactionError {
     fn from(e: CreateImageLayersError) -> Self {
         match e {
             CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            CreateImageLayersError::Other(e) => {
+                CompactionError::Other(e.context("create image layers"))
+            }
             _ => CompactionError::Other(e.into()),
         }
     }

@@ -845,6 +860,18 @@ impl Timeline {
             .map(|ancestor| ancestor.timeline_id)
     }

+    /// Get the bytes written since the PITR cutoff on this branch, and
+    /// whether this branch's ancestor_lsn is within its parent's PITR.
+    pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
+        let gc_info = self.gc_info.read().unwrap();
+        let history = self
+            .get_last_record_lsn()
+            .checked_sub(gc_info.cutoffs.pitr)
+            .unwrap_or(Lsn(0))
+            .0;
+        (history, gc_info.within_ancestor_pitr)
+    }
+
     /// Lock and get timeline's GC cutoff
     pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
         self.latest_gc_cutoff_lsn.read()

@@ -996,6 +1023,7 @@ impl Timeline {
     }

     pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
+    pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;

     /// Look up multiple page versions at a given LSN
     ///

@@ -1228,7 +1256,7 @@ impl Timeline {
         let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
             .for_get_kind(get_kind)
             .start_timer();
-        self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
+        self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
             .await?;
         get_data_timer.stop_and_record();

@@ -1258,11 +1286,25 @@ impl Timeline {
         // (this is a requirement, not a bug). Skip updating the metric in these cases
         // to avoid infinite results.
         if !results.is_empty() {
+            let avg = layers_visited as f64 / results.len() as f64;
+            if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    tracing::info!(
+                      shard_id = %self.tenant_shard_id.shard_slug(),
+                      lsn = %lsn,
+                      "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned",
+                      keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size());
+                });
+            }
+
             // Note that this is an approximation. Tracking the exact number of layers visited
             // per key requires virtually unbounded memory usage and is inefficient
             // (i.e. segment tree tracking each range queried from a layer)
-            crate::metrics::VEC_READ_NUM_LAYERS_VISITED
-                .observe(layers_visited as f64 / results.len() as f64);
+            crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg);
         }

         Ok(results)
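The warning added in the hunk above is rate-limited through a process-wide `Lazy<Mutex<RateLimit>>`. A self-contained approximation of that pattern follows; this `RateLimit` is a stand-in written for the sketch, and the real `utils::rate_limit::RateLimit` API may differ.

```rust
// Emit a log line at most once per interval, sharing the limiter across the process.
use once_cell::sync::Lazy;
use std::sync::Mutex;
use std::time::{Duration, Instant};

struct RateLimit {
    interval: Duration,
    last: Option<Instant>,
}

impl RateLimit {
    fn new(interval: Duration) -> Self {
        Self { interval, last: None }
    }

    /// Run `f` at most once per `interval`; otherwise silently drop the call.
    fn call(&mut self, f: impl FnOnce()) {
        let now = Instant::now();
        if self.last.map_or(true, |t| now.duration_since(t) >= self.interval) {
            self.last = Some(now);
            f();
        }
    }
}

static SLOW_READ_LOGGED: Lazy<Mutex<RateLimit>> =
    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));

fn maybe_warn(avg_layers_per_key: f64, threshold: f64) {
    if avg_layers_per_key >= threshold {
        SLOW_READ_LOGGED.lock().unwrap().call(|| {
            eprintln!("vectored read visited {avg_layers_per_key:.1} layers per key on average");
        });
    }
}
```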
@@ -1554,7 +1596,13 @@ impl Timeline {
                 let existing_lease = occupied.get_mut();
                 if valid_until > existing_lease.valid_until {
                     existing_lease.valid_until = valid_until;
+                    let dt: DateTime<Utc> = valid_until.into();
+                    info!("lease extended to {}", dt);
+                } else {
+                    let dt: DateTime<Utc> = existing_lease.valid_until.into();
+                    info!("existing lease covers greater length, valid until {}", dt);
                 }

                 existing_lease.clone()
             } else {
                 // Reject already GC-ed LSN (lsn < latest_gc_cutoff)

@@ -1563,6 +1611,8 @@ impl Timeline {
                     bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
                 }

+                let dt: DateTime<Utc> = valid_until.into();
+                info!("lease created, valid until {}", dt);
                 entry.or_insert(LsnLease { valid_until }).clone()
             }
         };

@@ -2339,6 +2389,7 @@ impl Timeline {
             )),
             repartition_threshold: 0,
             last_image_layer_creation_check_at: AtomicLsn::new(0),
+            last_image_layer_creation_check_instant: Mutex::new(None),

             last_received_wal: Mutex::new(None),
             rel_size_cache: RwLock::new(RelSizeCache {

@@ -2376,6 +2427,8 @@ impl Timeline {

             #[cfg(test)]
             extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
+
+            l0_flush_global_state: resources.l0_flush_global_state,
         };
         result.repartition_threshold =
             result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;

@@ -3355,6 +3408,7 @@ impl Timeline {
         }
     }

+    #[allow(clippy::doc_lazy_continuation)]
     /// Get the data needed to reconstruct all keys in the provided keyspace
     ///
     /// The algorithm is as follows:

@@ -4417,6 +4471,58 @@ impl Timeline {
         }
     }

+    /// Predicate function which indicates whether we should check if new image layers
+    /// are required. Since checking if new image layers are required is expensive in
+    /// terms of CPU, we only do it in the following cases:
+    /// 1. If the timeline has ingested sufficient WAL to justify the cost
+    /// 2. If enough time has passed since the last check:
+    ///     1. For large tenants, we wish to perform the check more often since they
+    ///        suffer from the lack of image layers
+    ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
+    fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
+        const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
+
+        let last_checks_at = self.last_image_layer_creation_check_at.load();
+        let distance = lsn
+            .checked_sub(last_checks_at)
+            .expect("Attempt to compact with LSN going backwards");
+        let min_distance =
+            self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance();
+
+        let distance_based_decision = distance.0 >= min_distance;
+
+        let mut time_based_decision = false;
+        let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
+        if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
+            let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
+            {
+                self.get_checkpoint_timeout()
+            } else {
+                Duration::from_secs(3600 * 48)
+            };
+
+            time_based_decision = match *last_check_instant {
+                Some(last_check) => {
+                    let elapsed = last_check.elapsed();
+                    elapsed >= check_required_after
+                }
+                None => true,
+            };
+        }
+
+        // Do the expensive delta layer counting only if this timeline has ingested sufficient
+        // WAL since the last check or a checkpoint timeout interval has elapsed since the last
+        // check.
+        let decision = distance_based_decision || time_based_decision;
+
+        if decision {
+            self.last_image_layer_creation_check_at.store(lsn);
+            *last_check_instant = Some(Instant::now());
+        }
+
+        decision
+    }
+
     #[tracing::instrument(skip_all, fields(%lsn, %mode))]
     async fn create_image_layers(
         self: &Arc<Timeline>,
|
|||||||
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
|
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
|
||||||
let mut start = Key::MIN;
|
let mut start = Key::MIN;
|
||||||
|
|
||||||
let check_for_image_layers = {
|
let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
|
||||||
let last_checks_at = self.last_image_layer_creation_check_at.load();
|
|
||||||
let distance = lsn
|
|
||||||
.checked_sub(last_checks_at)
|
|
||||||
.expect("Attempt to compact with LSN going backwards");
|
|
||||||
let min_distance = self.get_image_layer_creation_check_threshold() as u64
|
|
||||||
* self.get_checkpoint_distance();
|
|
||||||
|
|
||||||
// Skip the expensive delta layer counting if this timeline has not ingested sufficient
|
|
||||||
// WAL since the last check.
|
|
||||||
distance.0 >= min_distance
|
|
||||||
};
|
|
||||||
|
|
||||||
if check_for_image_layers {
|
|
||||||
self.last_image_layer_creation_check_at.store(lsn);
|
|
||||||
}
|
|
||||||
|
|
||||||
for partition in partitioning.parts.iter() {
|
for partition in partitioning.parts.iter() {
|
||||||
let img_range = start..partition.ranges.last().unwrap().end;
|
let img_range = start..partition.ranges.last().unwrap().end;
|
||||||
@@ -4483,6 +4574,22 @@ impl Timeline {
|
|||||||
start = img_range.end;
|
start = img_range.end;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
} else if let ImageLayerCreationMode::Force = mode {
|
||||||
|
// When forced to create image layers, we might try and create them where they already
|
||||||
|
// exist. This mode is only used in tests/debug.
|
||||||
|
let layers = self.layers.read().await;
|
||||||
|
if layers.contains_key(&PersistentLayerKey {
|
||||||
|
key_range: img_range.clone(),
|
||||||
|
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
|
||||||
|
is_delta: false,
|
||||||
|
}) {
|
||||||
|
tracing::info!(
|
||||||
|
"Skipping image layer at {lsn} {}..{}, already exists",
|
||||||
|
img_range.start,
|
||||||
|
img_range.end
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let image_layer_writer = ImageLayerWriter::new(
|
let image_layer_writer = ImageLayerWriter::new(
|
||||||
@@ -4613,7 +4720,7 @@ impl Timeline {
     /// Requires a timeline that:
     /// - has an ancestor to detach from
     /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
     ///   a technical requirement
     ///
     /// After the operation has been started, it cannot be canceled. Upon restart it needs to be
     /// polled again until completion.
@@ -4711,6 +4818,42 @@ impl DurationRecorder {
     }
 }
 
+/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the
+/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore,
+/// the layer descriptor requires the user to provide the ranges, which should cover all
+/// keys specified in the `data` field.
+#[cfg(test)]
+pub struct DeltaLayerTestDesc {
+    pub lsn_range: Range<Lsn>,
+    pub key_range: Range<Key>,
+    pub data: Vec<(Key, Lsn, Value)>,
+}
+
+#[cfg(test)]
+impl DeltaLayerTestDesc {
+    #[allow(dead_code)]
+    pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
+        Self {
+            lsn_range,
+            key_range,
+            data,
+        }
+    }
+
+    pub fn new_with_inferred_key_range(
+        lsn_range: Range<Lsn>,
+        data: Vec<(Key, Lsn, Value)>,
+    ) -> Self {
+        let key_min = data.iter().map(|(key, _, _)| key).min().unwrap();
+        let key_max = data.iter().map(|(key, _, _)| key).max().unwrap();
+        Self {
+            key_range: (*key_min)..(key_max.next()),
+            lsn_range,
+            data,
+        }
+    }
+}
+
 impl Timeline {
     async fn finish_compact_batch(
         self: &Arc<Self>,
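A minimal usage sketch for the descriptor above, assuming the usual test-only imports from the pageserver crate; the LSN literals are made up for illustration, and `data` must be non-empty for the inferred-range constructor.

// Sketch only: imports and literal LSNs are assumptions; `Value` is the
// pageserver's WAL value type (import path omitted here).
use std::ops::Range;

use pageserver_api::key::Key;
use utils::lsn::Lsn;

fn example_desc(data: Vec<(Key, Lsn, Value)>) -> DeltaLayerTestDesc {
    // The provided ranges must cover every (key, lsn) pair in `data`;
    // here the key range is inferred from the (non-empty) data itself.
    let lsn_range: Range<Lsn> = Lsn(0x10)..Lsn(0x20);
    DeltaLayerTestDesc::new_with_inferred_key_range(lsn_range, data)
}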
@@ -5511,37 +5654,65 @@ impl Timeline {
     #[cfg(test)]
     pub(super) async fn force_create_delta_layer(
         self: &Arc<Timeline>,
-        mut deltas: Vec<(Key, Lsn, Value)>,
+        mut deltas: DeltaLayerTestDesc,
         check_start_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let last_record_lsn = self.get_last_record_lsn();
-        deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
-        let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
-        let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
-        let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
-        let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
+        deltas
+            .data
+            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
+        assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start);
+        assert!(deltas.data.last().unwrap().0 < deltas.key_range.end);
+        for (_, lsn, _) in &deltas.data {
+            assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end);
+        }
         assert!(
-            max_lsn <= last_record_lsn,
-            "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
+            deltas.lsn_range.end <= last_record_lsn,
+            "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
+            deltas.lsn_range.end,
+            last_record_lsn
         );
-        let end_lsn = Lsn(max_lsn.0 + 1);
         if let Some(check_start_lsn) = check_start_lsn {
-            assert!(min_lsn >= check_start_lsn);
+            assert!(deltas.lsn_range.start >= check_start_lsn);
         }
+        // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
+        // layers of the same start/end LSN, and so should the force inserted layer
+        {
+            /// Checks if a overlaps with b, assume a/b = [start, end).
+            pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+                !(a.end <= b.start || b.end <= a.start)
+            }
+
+            let guard = self.layers.read().await;
+            for layer in guard.layer_map().iter_historic_layers() {
+                if layer.is_delta()
+                    && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
+                    && layer.lsn_range != deltas.lsn_range
+                {
+                    // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
+                    panic!(
+                        "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
+                        deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
+                    );
+                }
+            }
+        }
         let mut delta_layer_writer = DeltaLayerWriter::new(
             self.conf,
             self.timeline_id,
             self.tenant_shard_id,
-            min_key,
-            min_lsn..end_lsn,
+            deltas.key_range.start,
+            deltas.lsn_range,
             ctx,
         )
         .await?;
-        for (key, lsn, val) in deltas {
+        for (key, lsn, val) in deltas.data {
             delta_layer_writer.put_value(key, lsn, val, ctx).await?;
         }
-        let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?;
+        let delta_layer = delta_layer_writer
+            .finish(deltas.key_range.end, self, ctx)
+            .await?;
 
         {
             let mut guard = self.layers.write().await;
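The helper above treats LSN ranges as half-open intervals [start, end). A small standalone sketch of the same predicate, on plain integer ranges, shows that adjacency does not count as overlap while identical ranges do (which the check above explicitly permits).

use std::ops::Range;

/// Same overlap predicate as in the hunk above, on half-open ranges [start, end).
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}

fn main() {
    // Touching end-to-start is NOT an overlap for half-open ranges...
    assert!(!overlaps_with(&(0..10), &(10..20)));
    // ...but sharing any interior point is.
    assert!(overlaps_with(&(0..10), &(9..20)));
    // Identical ranges overlap, which the delta-layer check allows.
    assert!(overlaps_with(&(0..10), &(0..10)));
}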
@@ -182,13 +182,15 @@ async fn remove_timeline_from_tenant(
 /// 5. Delete index part
 /// 6. Delete meta, timeline directory
 /// 7. Delete mark file
+///
 /// It is resumable from any step in case a crash/restart occurs.
 /// There are three entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
 ///    and we possibly neeed to continue deletion of remote files.
 /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
 ///    index but still have local metadata, timeline directory and delete mark.
+///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
 pub enum DeleteTimelineFlow {
@@ -272,6 +274,7 @@ impl DeleteTimelineFlow {
             TimelineResources {
                 remote_client,
                 timeline_get_throttle: tenant.timeline_get_throttle.clone(),
+                l0_flush_global_state: tenant.l0_flush_global_state.clone(),
             },
             // Important. We dont pass ancestor above because it can be missing.
             // Thus we need to skip the validation here.
@@ -339,6 +339,10 @@ impl LayerManager {
         self.layer_fmgr.contains(layer)
     }
 
+    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.layer_fmgr.contains_key(key)
+    }
+
     pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
         self.layer_fmgr.0.keys().cloned().collect_vec()
     }
@@ -363,6 +367,10 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
         .clone()
     }
 
+    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.0.contains_key(key)
+    }
+
     pub(crate) fn insert(&mut self, layer: T) {
         let present = self.0.insert(layer.layer_desc().key(), layer.clone());
         if present.is_some() && cfg!(debug_assertions) {
@@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
 /// Calculation consists of two stages:
 ///
 /// 1. Initial size calculation. That might take a long time, because it requires
 ///    reading all layers containing relation sizes at `initial_part_end`.
 ///
 /// 2. Collecting an incremental part and adding that to the initial size.
 ///    Increments are appended on walreceiver writing new timeline data,
 ///    which result in increase or decrease of the logical size.
 pub(super) struct LogicalSize {
     /// Size, potentially slow to compute. Calculating this might require reading multiple
     /// layers, and even ancestor's layers.
@@ -45,17 +45,17 @@ pub(super) struct LogicalSize {
     /// Size shouldn't ever be negative, but this is signed for two reasons:
     ///
     /// 1. If we initialized the "baseline" size lazily, while we already
     ///    process incoming WAL, the incoming WAL records could decrement the
     ///    variable and temporarily make it negative. (This is just future-proofing;
     ///    the initialization is currently not done lazily.)
     ///
     /// 2. If there is a bug and we e.g. forget to increment it in some cases
     ///    when size grows, but remember to decrement it when it shrinks again, the
     ///    variable could go negative. In that case, it seems better to at least
     ///    try to keep tracking it, rather than clamp or overflow it. Note that
     ///    get_current_logical_size() will clamp the returned value to zero if it's
     ///    negative, and log an error. Could set it permanently to zero or some
     ///    special value to indicate "broken" instead, but this will do for now.
     ///
     /// Note that we also expose a copy of this value as a prometheus metric,
     /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
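The doc comment above explains why the incremental part is kept signed and clamped to zero on read. Below is a minimal sketch of that pattern with AtomicI64; the type and method names are hypothetical and are not the pageserver's actual LogicalSize API.

use std::sync::atomic::{AtomicI64, Ordering};

/// Sketch only: a signed incremental size with clamp-on-read.
struct IncrementalSize {
    delta: AtomicI64,
}

impl IncrementalSize {
    fn record(&self, change: i64) {
        // WAL ingestion may grow or shrink the logical size.
        self.delta.fetch_add(change, Ordering::Relaxed);
    }

    fn current(&self) -> u64 {
        let v = self.delta.load(Ordering::Relaxed);
        if v < 0 {
            // A negative value indicates a bookkeeping bug or a lazily
            // initialized baseline; report zero instead of underflowing.
            0
        } else {
            v as u64
        }
    }
}

fn main() {
    let s = IncrementalSize { delta: AtomicI64::new(0) };
    s.record(8192);
    s.record(-16384);
    assert_eq!(s.current(), 0); // clamped, not wrapped
}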
@@ -2,13 +2,13 @@
 //! To do so, a current implementation needs to do the following:
 //!
 //! * acknowledge the timelines that it needs to stream WAL into.
 //!   Pageserver is able to dynamically (un)load tenants on attach and detach,
 //!   hence WAL receiver needs to react on such events.
 //!
 //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
 //!   For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
 //!   The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
 //!   Without this data, no WAL streaming is possible currently.
 //!
 //! Only one active WAL streaming connection is allowed at a time.
 //! The connection is supposed to be updated periodically, based on safekeeper timeline data.
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::{
     context::RequestContext,
-    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
+    metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
     task_mgr::TaskKind,
     task_mgr::WALRECEIVER_RUNTIME,
     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection(
         .instrument(tracing::info_span!("poller")),
     );
 
-    // Immediately increment the gauge, then create a job to decrement it on task exit.
-    // One of the pros of `defer!` is that this will *most probably*
-    // get called, even in presence of panics.
-    let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]);
-    gauge.inc();
-    scopeguard::defer! {
-        gauge.dec();
-    }
+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["wal_receiver"])
+        .guard();
 
     let identify = identify_system(&replication_client).await?;
     info!("{identify:?}");
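The hunk above swaps manual inc()/scopeguard bookkeeping for a guard object returned by the metric. Below is a minimal sketch of the underlying RAII idea with a hypothetical gauge type; the real LIVE_CONNECTIONS metric lives in the pageserver's metrics module and is not reproduced here.

use std::sync::atomic::{AtomicI64, Ordering};

/// Sketch only: a gauge whose guard increments on creation and decrements on
/// drop, so the decrement also runs on early return or unwind.
struct Gauge {
    value: AtomicI64,
}

struct GaugeGuard<'a> {
    gauge: &'a Gauge,
}

impl Gauge {
    fn guard(&self) -> GaugeGuard<'_> {
        self.value.fetch_add(1, Ordering::Relaxed);
        GaugeGuard { gauge: self }
    }
}

impl Drop for GaugeGuard<'_> {
    fn drop(&mut self) {
        self.gauge.value.fetch_sub(1, Ordering::Relaxed);
    }
}

fn main() {
    let live = Gauge { value: AtomicI64::new(0) };
    {
        let _g = live.guard();
        assert_eq!(live.value.load(Ordering::Relaxed), 1);
    } // guard dropped here
    assert_eq!(live.value.load(Ordering::Relaxed), 0);
}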
@@ -20,11 +20,13 @@ use std::num::NonZeroUsize;
 
 use bytes::BytesMut;
 use pageserver_api::key::Key;
+use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
 use utils::lsn::Lsn;
 use utils::vec_map::VecMap;
 
 use crate::context::RequestContext;
+use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
 use crate::virtual_file::VirtualFile;
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -68,7 +70,7 @@ impl VectoredRead {
     }
 }
 
-#[derive(Eq, PartialEq)]
+#[derive(Eq, PartialEq, Debug)]
 pub(crate) enum VectoredReadExtended {
     Yes,
     No,
@@ -91,7 +93,7 @@ impl VectoredReadBuilder {
         start_offset: u64,
         end_offset: u64,
         meta: BlobMeta,
-        max_read_size: Option<usize>,
+        max_read_size: usize,
     ) -> Self {
         let mut blobs_at = VecMap::default();
         blobs_at
@@ -102,10 +104,9 @@ impl VectoredReadBuilder {
             start: start_offset,
             end: end_offset,
             blobs_at,
-            max_read_size,
+            max_read_size: Some(max_read_size),
         }
     }
 
     /// Attempt to extend the current read with a new blob if the start
     /// offset matches with the current end of the vectored read
     /// and the resuting size is below the max read size
@@ -164,7 +165,7 @@ pub struct VectoredReadPlanner {
     // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
     prev: Option<(Key, Lsn, u64, BlobFlag)>,
 
-    max_read_size: Option<usize>,
+    max_read_size: usize,
 }
 
 impl VectoredReadPlanner {
@@ -172,20 +173,7 @@ impl VectoredReadPlanner {
         Self {
             blobs: BTreeMap::new(),
             prev: None,
-            max_read_size: Some(max_read_size),
-        }
-    }
-
-    /// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`],
-    /// it uses the vectored read planner to avoid duplicated logic on handling blob start/end, while expecting the vectored
-    /// read planner to give a single read to a continuous range of bytes in the image layer. Therefore, it does not need the
-    /// code path to split reads into chunks of `max_read_size`, and controls the read size itself.
-    #[cfg(test)]
-    pub(crate) fn new_caller_controlled_max_limit() -> Self {
-        Self {
-            blobs: BTreeMap::new(),
-            prev: None,
-            max_read_size: None,
+            max_read_size,
         }
     }
 
@@ -203,9 +191,9 @@ impl VectoredReadPlanner {
     ///
     /// The `flag` argument has two interesting values:
     /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs.
     ///   This is used for WAL records that `will_init`.
     /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
     ///   if the blob is cached.
     pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
         // Implementation note: internally lag behind by one blob such that
         // we have a start and end offset when initialising [`VectoredRead`]
@@ -315,7 +303,7 @@ impl<'a> VectoredBlobReader<'a> {
             read.size(),
             buf.capacity()
         );
-        let buf = self
+        let mut buf = self
             .file
             .read_exact_at(buf.slice(0..read.size()), read.start, ctx)
             .await?
@@ -337,38 +325,68 @@ impl<'a> VectoredBlobReader<'a> {
                 .chain(std::iter::once(None)),
         );
 
+        // Some scratch space, put here for reusing the allocation
+        let mut decompressed_vec = Vec::new();
+
         for ((offset, meta), next) in pairs {
             let offset_in_buf = offset - start_offset;
             let first_len_byte = buf[offset_in_buf as usize];
 
-            // Each blob is prefixed by a header containing it's size.
+            // Each blob is prefixed by a header containing its size and compression information.
             // Extract the size and skip that header to find the start of the data.
             // The size can be 1 or 4 bytes. The most significant bit is 0 in the
             // 1 byte case and 1 in the 4 byte case.
-            let (size_length, blob_size) = if first_len_byte < 0x80 {
-                (1, first_len_byte as u64)
+            let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 {
+                (1, first_len_byte as u64, BYTE_UNCOMPRESSED)
             } else {
                 let mut blob_size_buf = [0u8; 4];
                 let offset_in_buf = offset_in_buf as usize;
 
                 blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
-                blob_size_buf[0] &= 0x7f;
-                (4, u32::from_be_bytes(blob_size_buf) as u64)
+                blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
+
+                let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
+                (
+                    4,
+                    u32::from_be_bytes(blob_size_buf) as u64,
+                    compression_bits,
+                )
             };
 
-            let start = offset_in_buf + size_length;
-            let end = match next {
+            let start_raw = offset_in_buf + size_length;
+            let end_raw = match next {
                 Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
-                None => start + blob_size,
+                None => start_raw + blob_size,
             };
-
-            assert_eq!(end - start, blob_size);
+            assert_eq!(end_raw - start_raw, blob_size);
+            let (start, end);
+            if compression_bits == BYTE_UNCOMPRESSED {
+                start = start_raw as usize;
+                end = end_raw as usize;
+            } else if compression_bits == BYTE_ZSTD {
+                let mut decoder =
+                    async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
+                decoder
+                    .write_all(&buf[start_raw as usize..end_raw as usize])
+                    .await?;
+                decoder.flush().await?;
+                start = buf.len();
+                buf.extend_from_slice(&decompressed_vec);
+                end = buf.len();
+                decompressed_vec.clear();
+            } else {
+                let error = std::io::Error::new(
+                    std::io::ErrorKind::InvalidData,
+                    format!("invalid compression byte {compression_bits:x}"),
+                );
+                return Err(error);
+            }
 
             metas.push(VectoredBlob {
-                start: start as usize,
-                end: end as usize,
+                start,
+                end,
                 meta: *meta,
-            })
+            });
         }
 
         Ok(VectoredBlobsBuf { buf, blobs: metas })
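For reference, a standalone sketch of the length-header decoding used above. The split between the 1-byte and 4-byte forms follows the comment in the hunk; the exact constant values below are assumptions for illustration, the real ones live in the pageserver's blob_io module.

/// Sketch only: the constant values here are assumed, not quoted from the repo.
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; // assumed mask for the header flag bits
const BYTE_UNCOMPRESSED: u8 = 0x80;        // assumed "4-byte length, no compression"
const BYTE_ZSTD: u8 = 0x90;                // assumed "4-byte length, zstd"

/// Returns (header length in bytes, blob length, compression bits).
fn parse_header(buf: &[u8]) -> (usize, u64, u8) {
    let first = buf[0];
    if first < 0x80 {
        // Short form: one byte, the length is the byte itself, never compressed.
        (1, first as u64, BYTE_UNCOMPRESSED)
    } else {
        // Long form: 4 bytes big-endian; the top bits of the first byte carry flags.
        let len = [first & !LEN_COMPRESSION_BIT_MASK, buf[1], buf[2], buf[3]];
        (4, u32::from_be_bytes(len) as u64, first & LEN_COMPRESSION_BIT_MASK)
    }
}

fn main() {
    assert_eq!(parse_header(&[0x05]), (1, 5, BYTE_UNCOMPRESSED));
    assert_eq!(parse_header(&[0x80, 0x00, 0x01, 0x00]), (4, 256, BYTE_UNCOMPRESSED));
    assert_eq!(parse_header(&[0x90, 0x00, 0x01, 0x00]).2, BYTE_ZSTD);
}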
@@ -376,17 +394,18 @@ impl<'a> VectoredBlobReader<'a> {
 }
 
 /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
-/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and
-/// max_cnt constraints. Underlying it uses [`VectoredReadPlanner`].
+/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
+/// max_cnt constraints.
 #[cfg(test)]
 pub struct StreamingVectoredReadPlanner {
-    planner: VectoredReadPlanner,
-    /// Max read size per batch
+    read_builder: Option<VectoredReadBuilder>,
+    // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
+    prev: Option<(Key, Lsn, u64)>,
+    /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
+    /// we will produce a single batch instead of split them.
     max_read_size: u64,
     /// Max item count per batch
     max_cnt: usize,
-    /// The first offset of this batch
-    this_batch_first_offset: Option<u64>,
     /// Size of the current batch
     cnt: usize,
 }
@@ -397,67 +416,100 @@ impl StreamingVectoredReadPlanner {
         assert!(max_cnt > 0);
         assert!(max_read_size > 0);
         Self {
-            // We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call.
-            // Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability,
-            // to avoid splitting into two I/Os.
-            planner: VectoredReadPlanner::new_caller_controlled_max_limit(),
+            read_builder: None,
+            prev: None,
             max_cnt,
             max_read_size,
-            this_batch_first_offset: None,
             cnt: 0,
         }
     }
 
-    fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead {
-        let planner = std::mem::replace(
-            &mut self.planner,
-            VectoredReadPlanner::new_caller_controlled_max_limit(),
-        );
-        self.this_batch_first_offset = Some(this_batch_first_offset);
-        self.cnt = 1;
-        let mut batch = planner.finish();
-        assert_eq!(batch.len(), 1, "should have exactly one read batch");
-        batch.pop().unwrap()
+    pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option<VectoredRead> {
+        // Implementation note: internally lag behind by one blob such that
+        // we have a start and end offset when initialising [`VectoredRead`]
+        let (prev_key, prev_lsn, prev_offset) = match self.prev {
+            None => {
+                self.prev = Some((key, lsn, offset));
+                return None;
+            }
+            Some(prev) => prev,
+        };
+
+        let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false);
+
+        self.prev = Some((key, lsn, offset));
+
+        res
     }
 
-    pub fn handle(
+    pub fn handle_range_end(&mut self, offset: u64) -> Option<VectoredRead> {
+        let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev {
+            self.add_blob(prev_key, prev_lsn, prev_offset, offset, true)
+        } else {
+            None
+        };
+
+        self.prev = None;
+
+        res
+    }
+
+    fn add_blob(
         &mut self,
         key: Key,
         lsn: Lsn,
-        offset: u64,
-        flag: BlobFlag,
+        start_offset: u64,
+        end_offset: u64,
+        is_last_blob_in_read: bool,
     ) -> Option<VectoredRead> {
-        if let Some(begin_offset) = self.this_batch_first_offset {
-            // Each batch will have at least one item b/c `self.this_batch_first_offset` is set
-            // after one item gets processed
-            if offset - begin_offset > self.max_read_size {
-                self.planner.handle_range_end(offset); // End the current batch with the offset
-                let batch = self.emit(offset); // Produce a batch
-                self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
-                return Some(batch);
-            }
-        } else {
-            self.this_batch_first_offset = Some(offset)
-        }
-        if self.cnt >= self.max_cnt {
-            self.planner.handle_range_end(offset); // End the current batch with the offset
-            let batch = self.emit(offset); // Produce a batch
-            self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
-            return Some(batch);
-        }
-        self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch
-        self.cnt += 1;
-        None
-    }
-
-    pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead {
-        self.planner.handle_range_end(offset);
-        self.emit(offset)
+        match &mut self.read_builder {
+            Some(read_builder) => {
+                let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
+                assert_eq!(extended, VectoredReadExtended::Yes);
+            }
+            None => {
+                self.read_builder = {
+                    let mut blobs_at = VecMap::default();
+                    blobs_at
+                        .append(start_offset, BlobMeta { key, lsn })
+                        .expect("First insertion always succeeds");
+
+                    Some(VectoredReadBuilder {
+                        start: start_offset,
+                        end: end_offset,
+                        blobs_at,
+                        max_read_size: None,
+                    })
+                };
+            }
+        }
+        let read_builder = self.read_builder.as_mut().unwrap();
+        self.cnt += 1;
+        if is_last_blob_in_read
+            || read_builder.size() >= self.max_read_size as usize
+            || self.cnt >= self.max_cnt
+        {
+            let prev_read_builder = self.read_builder.take();
+            self.cnt = 0;
+
+            // `current_read_builder` is None in the first iteration
+            if let Some(read_builder) = prev_read_builder {
+                return Some(read_builder.build());
+            }
+        }
+        None
     }
 }
 
 #[cfg(test)]
 mod tests {
+    use anyhow::Error;
+
+    use crate::context::DownloadBehavior;
+    use crate::page_cache::PAGE_SZ;
+    use crate::task_mgr::TaskKind;
+
+    use super::super::blob_io::tests::{random_array, write_maybe_compressed};
     use super::*;
 
     fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
@@ -509,8 +561,11 @@ mod tests {
         planner.handle_range_end(652 * 1024);
 
         let reads = planner.finish();
 
         assert_eq!(reads.len(), 6);
 
+        // TODO: could remove zero reads to produce 5 reads here
+
         for (idx, read) in reads.iter().enumerate() {
             validate_read(read, ranges[idx]);
         }
@@ -548,4 +603,187 @@ mod tests {
             validate_read(read, ranges[idx]);
         }
     }
+
+    #[test]
+    fn streaming_planner_max_read_size_test() {
+        let max_read_size = 128 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (key, lsn, 0, BlobFlag::None),
+            (key, lsn, 32 * 1024, BlobFlag::None),
+            (key, lsn, 96 * 1024, BlobFlag::None),
+            (key, lsn, 128 * 1024, BlobFlag::None),
+            (key, lsn, 198 * 1024, BlobFlag::None),
+            (key, lsn, 268 * 1024, BlobFlag::None),
+            (key, lsn, 396 * 1024, BlobFlag::None),
+            (key, lsn, 652 * 1024, BlobFlag::None),
+        ];
+
+        let ranges = [
+            &blob_descriptions[0..3],
+            &blob_descriptions[3..5],
+            &blob_descriptions[5..6],
+            &blob_descriptions[6..7],
+            &blob_descriptions[7..],
+        ];
+
+        let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000);
+        let mut reads = Vec::new();
+        for (key, lsn, offset, _) in blob_descriptions.clone() {
+            reads.extend(planner.handle(key, lsn, offset));
+        }
+        reads.extend(planner.handle_range_end(652 * 1024));
+
+        assert_eq!(reads.len(), ranges.len());
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+
+    #[test]
+    fn streaming_planner_max_cnt_test() {
+        let max_read_size = 1024 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (key, lsn, 0, BlobFlag::None),
+            (key, lsn, 32 * 1024, BlobFlag::None),
+            (key, lsn, 96 * 1024, BlobFlag::None),
+            (key, lsn, 128 * 1024, BlobFlag::None),
+            (key, lsn, 198 * 1024, BlobFlag::None),
+            (key, lsn, 268 * 1024, BlobFlag::None),
+            (key, lsn, 396 * 1024, BlobFlag::None),
+            (key, lsn, 652 * 1024, BlobFlag::None),
+        ];
+
+        let ranges = [
+            &blob_descriptions[0..2],
+            &blob_descriptions[2..4],
+            &blob_descriptions[4..6],
+            &blob_descriptions[6..],
+        ];
+
+        let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
+        let mut reads = Vec::new();
+        for (key, lsn, offset, _) in blob_descriptions.clone() {
+            reads.extend(planner.handle(key, lsn, offset));
+        }
+        reads.extend(planner.handle_range_end(652 * 1024));
+
+        assert_eq!(reads.len(), ranges.len());
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+
+    #[test]
+    fn streaming_planner_edge_test() {
+        let max_read_size = 1024 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+        {
+            let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
+            let mut reads = Vec::new();
+            reads.extend(planner.handle_range_end(652 * 1024));
+            assert!(reads.is_empty());
+        }
+        {
+            let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
+            let mut reads = Vec::new();
+            reads.extend(planner.handle(key, lsn, 0));
+            reads.extend(planner.handle_range_end(652 * 1024));
+            assert_eq!(reads.len(), 1);
+            validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
+        }
+        {
+            let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
+            let mut reads = Vec::new();
+            reads.extend(planner.handle(key, lsn, 0));
+            reads.extend(planner.handle(key, lsn, 128 * 1024));
+            reads.extend(planner.handle_range_end(652 * 1024));
+            assert_eq!(reads.len(), 2);
+            validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
+            validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]);
+        }
+        {
+            let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
+            let mut reads = Vec::new();
+            reads.extend(planner.handle(key, lsn, 0));
+            reads.extend(planner.handle(key, lsn, 128 * 1024));
+            reads.extend(planner.handle_range_end(652 * 1024));
+            assert_eq!(reads.len(), 1);
+            validate_read(
+                &reads[0],
+                &[
+                    (key, lsn, 0, BlobFlag::None),
+                    (key, lsn, 128 * 1024, BlobFlag::None),
+                ],
+            );
+        }
+    }
+
+    async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let (_temp_dir, pathbuf, offsets) =
+            write_maybe_compressed::<true>(blobs, compression, &ctx).await?;
+
+        let file = VirtualFile::open(&pathbuf, &ctx).await?;
+        let file_len = std::fs::metadata(&pathbuf)?.len();
+
+        // Multiply by two (compressed data might need more space), and add a few bytes for the header
+        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
+        let mut buf = BytesMut::with_capacity(reserved_bytes);
+
+        let vectored_blob_reader = VectoredBlobReader::new(&file);
+        let meta = BlobMeta {
+            key: Key::MIN,
+            lsn: Lsn(0),
+        };
+
+        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
+            let end = offsets.get(idx + 1).unwrap_or(&file_len);
+            if idx + 1 == offsets.len() {
+                continue;
+            }
+            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
+            let read = read_builder.build();
+            let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
+            assert_eq!(result.blobs.len(), 1);
+            let read_blob = &result.blobs[0];
+            let read_buf = &result.buf[read_blob.start..read_blob.end];
+            assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
+            buf = result.buf;
+        }
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_really_big_array() -> Result<(), Error> {
+        let blobs = &[
+            b"test".to_vec(),
+            random_array(10 * PAGE_SZ),
+            b"hello".to_vec(),
+            random_array(66 * PAGE_SZ),
+            vec![0xf3; 24 * PAGE_SZ],
+            b"foobar".to_vec(),
+        ];
+        round_trip_test_compressed(blobs, false).await?;
+        round_trip_test_compressed(blobs, true).await?;
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_arrays_inc() -> Result<(), Error> {
+        let blobs = (0..PAGE_SZ / 8)
+            .map(|v| random_array(v * 16))
+            .collect::<Vec<_>>();
+        round_trip_test_compressed(&blobs, false).await?;
+        round_trip_test_compressed(&blobs, true).await?;
+        Ok(())
+    }
 }
@@ -1,36 +0,0 @@
-use bytes::Bytes;
-use camino::Utf8PathBuf;
-use std::{
-    fs::{create_dir_all, File},
-    io::{BufWriter, Write},
-};
-
-pub struct Tracer {
-    writer: BufWriter<File>,
-}
-
-impl Drop for Tracer {
-    fn drop(&mut self) {
-        self.flush()
-    }
-}
-
-impl Tracer {
-    pub fn new(path: Utf8PathBuf) -> Self {
-        let parent = path.parent().expect("failed to parse parent path");
-        create_dir_all(parent).expect("failed to create trace dir");
-
-        let file = File::create(path).expect("failed to create trace file");
-        Tracer {
-            writer: BufWriter::new(file),
-        }
-    }
-
-    pub fn trace(&mut self, msg: &Bytes) {
-        self.writer.write_all(msg).expect("failed to write trace");
-    }
-
-    pub fn flush(&mut self) {
-        self.writer.flush().expect("failed to flush trace file");
-    }
-}
@@ -33,6 +33,7 @@ pub struct BufferedWriter<B, W> {
     /// invariant: always remains Some(buf) except
     /// - while IO is ongoing => goes back to Some() once the IO completed successfully
    /// - after an IO error => stays `None` forever
+    ///
     /// In these exceptional cases, it's `None`.
     buf: Option<B>,
 }
@@ -343,7 +343,33 @@ impl WalIngest {
                     xlog_checkpoint.oldestActiveXid,
                     self.checkpoint.oldestActiveXid
                 );
-                self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+
+                // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
+                // because at shutdown, all in-progress transactions will implicitly
+                // end. Postgres startup code knows that, and allows hot standby to start
+                // immediately from a shutdown checkpoint.
+                //
+                // In Neon, Postgres hot standby startup always behaves as if starting from
+                // an online checkpoint. It needs a valid `oldestActiveXid` value, so
+                // instead of overwriting self.checkpoint.oldestActiveXid with
+                // InvalidTransactionid from the checkpoint WAL record, update it to a
+                // proper value, knowing that there are no in-progress transactions at this
+                // point, except for prepared transactions.
+                //
+                // See also the neon code changes in the InitWalRecovery() function.
+                if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
+                    && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                {
+                    let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
+                    for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                        if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                            oldest_active_xid = xid;
+                        }
+                    }
+                    self.checkpoint.oldestActiveXid = oldest_active_xid;
+                } else {
+                    self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+                }
 
                 // Write a new checkpoint key-value pair on every checkpoint record, even
                 // if nothing really changed. Not strictly required, but it seems nice to
@@ -375,6 +401,7 @@ impl WalIngest {
                 if info == pg_constants::XLOG_RUNNING_XACTS {
                     let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
                     self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
+                    self.checkpoint_modified = true;
                 }
             }
             pg_constants::RM_REPLORIGIN_ID => {
@@ -1277,13 +1304,10 @@ impl WalIngest {
                     xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db
                 );
 
-                // Here we treat oldestXid and oldestXidDB
-                // differently from postgres redo routines.
-                // In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid
-                // until checkpoint happens and updates the value.
-                // Here we can use the most recent value.
-                // It's just an optimization, though and can be deleted.
-                // TODO Figure out if there will be any issues with replica.
+                // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is
+                // truncated, but a checkpoint record with the updated values isn't written until
+                // later. In Neon, a server can start at any LSN, not just on a checkpoint record,
+                // so we keep the oldestXid and oldestXidDB up-to-date.
                 self.checkpoint.oldestXid = xlrec.oldest_xid;
                 self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
                 self.checkpoint_modified = true;
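The loop above picks the oldest prepared-transaction XID with `(xid.wrapping_sub(oldest) as i32) < 0`, the usual modulo-2^32 ordering trick for transaction IDs. A tiny standalone sketch on plain u32 values:

/// Sketch only: "is `a` older than `b`" under 32-bit wraparound, the same
/// comparison used in the hunk above.
fn xid_precedes(a: u32, b: u32) -> bool {
    (a.wrapping_sub(b) as i32) < 0
}

fn main() {
    // Ordinary case: 100 is older than 200.
    assert!(xid_precedes(100, 200));
    // Wraparound case: u32::MAX - 5 is older than 5 (which sits just past the wrap).
    assert!(xid_precedes(u32::MAX - 5, 5));
    assert!(!xid_precedes(5, u32::MAX - 5));
}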
@@ -6,6 +6,7 @@ OBJS = \
 	$(WIN32RES) \
 	extension_server.o \
 	file_cache.o \
+	hll.o \
 	libpagestore.o \
 	neon.o \
 	neon_utils.o \
@@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \
|
|||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
#include "pagestore_client.h"
|
#include "pagestore_client.h"
|
||||||
#include "common/hashfn.h"
|
#include "common/hashfn.h"
|
||||||
#include "lib/hyperloglog.h"
|
|
||||||
#include "pgstat.h"
|
#include "pgstat.h"
|
||||||
#include "postmaster/bgworker.h"
|
#include "postmaster/bgworker.h"
|
||||||
#include RELFILEINFO_HDR
|
#include RELFILEINFO_HDR
|
||||||
@@ -40,6 +39,8 @@
|
|||||||
#include "utils/dynahash.h"
|
#include "utils/dynahash.h"
|
||||||
#include "utils/guc.h"
|
#include "utils/guc.h"
|
||||||
|
|
||||||
|
#include "hll.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local file cache is used to temporary store relations pages in local file system.
|
* Local file cache is used to temporary store relations pages in local file system.
|
||||||
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
||||||
@@ -62,7 +63,6 @@
|
|||||||
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
||||||
#define MB ((uint64)1024*1024)
|
#define MB ((uint64)1024*1024)
|
||||||
|
|
||||||
#define HYPER_LOG_LOG_BIT_WIDTH 10
|
|
||||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
||||||
|
|
||||||
typedef struct FileCacheEntry
|
typedef struct FileCacheEntry
|
||||||
@@ -87,8 +87,7 @@ typedef struct FileCacheControl
|
|||||||
uint64 writes;
|
uint64 writes;
|
||||||
dlist_head lru; /* double linked list for LRU replacement
|
dlist_head lru; /* double linked list for LRU replacement
|
||||||
* algorithm */
|
* algorithm */
|
||||||
hyperLogLogState wss_estimation; /* estimation of wroking set size */
|
HyperLogLogState wss_estimation; /* estimation of working set size */
|
||||||
uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
|
|
||||||
} FileCacheControl;
|
} FileCacheControl;
|
||||||
|
|
||||||
static HTAB *lfc_hash;
|
static HTAB *lfc_hash;
|
||||||
@@ -238,12 +237,7 @@ lfc_shmem_startup(void)
|
|||||||
dlist_init(&lfc_ctl->lru);
|
dlist_init(&lfc_ctl->lru);
|
||||||
|
|
||||||
/* Initialize hyper-log-log structure for estimating working set size */
|
/* Initialize hyper-log-log structure for estimating working set size */
|
||||||
initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
|
initSHLL(&lfc_ctl->wss_estimation);
|
||||||
|
|
||||||
/* We need hashes in shared memory */
|
|
||||||
pfree(lfc_ctl->wss_estimation.hashesArr);
|
|
||||||
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
|
|
||||||
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
|
|
||||||
|
|
||||||
/* Recreate file cache on restart */
|
/* Recreate file cache on restart */
|
||||||
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
||||||
@@ -545,7 +539,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
|
|
||||||
/* Approximate working set */
|
/* Approximate working set */
|
||||||
tag.blockNum = blkno;
|
tag.blockNum = blkno;
|
||||||
addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
||||||
|
|
||||||
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
|
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
|
||||||
{
|
{
|
||||||
@@ -986,20 +980,38 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
|||||||
SRF_RETURN_DONE(funcctx);
|
SRF_RETURN_DONE(funcctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
|
||||||
|
|
||||||
|
Datum
|
||||||
|
approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
if (lfc_size_limit != 0)
|
||||||
|
{
|
||||||
|
int32 dc;
|
||||||
|
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
|
||||||
|
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||||
|
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
|
||||||
|
LWLockRelease(lfc_lock);
|
||||||
|
PG_RETURN_INT32(dc);
|
||||||
|
}
|
||||||
|
PG_RETURN_NULL();
|
||||||
|
}
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(approximate_working_set_size);
|
PG_FUNCTION_INFO_V1(approximate_working_set_size);
|
||||||
|
|
||||||
Datum
|
Datum
|
||||||
approximate_working_set_size(PG_FUNCTION_ARGS)
|
approximate_working_set_size(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
int32 dc = -1;
|
|
||||||
if (lfc_size_limit != 0)
|
if (lfc_size_limit != 0)
|
||||||
{
|
{
|
||||||
|
int32 dc;
|
||||||
bool reset = PG_GETARG_BOOL(0);
|
bool reset = PG_GETARG_BOOL(0);
|
||||||
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
|
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
|
||||||
dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
|
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
|
||||||
if (reset)
|
if (reset)
|
||||||
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
|
memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
|
||||||
LWLockRelease(lfc_lock);
|
LWLockRelease(lfc_lock);
|
||||||
|
PG_RETURN_INT32(dc);
|
||||||
}
|
}
|
||||||
PG_RETURN_INT32(dc);
|
PG_RETURN_NULL();
|
||||||
}
|
}
|
||||||
|
|||||||
193
pgxn/neon/hll.c
Normal file
193
pgxn/neon/hll.c
Normal file
@@ -0,0 +1,193 @@
/*-------------------------------------------------------------------------
 *
 * hll.c
 *	  Sliding HyperLogLog cardinality estimator
 *
 * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
 *
 * Implements https://hal.science/hal-00465313/document
 *
 * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
 * suited to estimating the cardinality of very large sets; in particular, we
 * have not attempted to further optimize the implementation as described in
 * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
 * Engineering of a State of The Art Cardinality Estimation Algorithm".
 *
 * A sparse representation of HyperLogLog state is used, with fixed space
 * overhead.
 *
 * The copyright terms of Ohno's original version (the MIT license) follow.
 *
 * IDENTIFICATION
 *	  src/backend/lib/hyperloglog.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the 'Software'), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <math.h>

#include "postgres.h"
#include "funcapi.h"
#include "port/pg_bitutils.h"
#include "utils/timestamp.h"
#include "hll.h"


#define POW_2_32			(4294967296.0)
#define NEG_POW_2_32		(-4294967296.0)

#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS)

/*
 * Worker for addSHLL().
 *
 * Calculates the position of the first set bit within the first b bits of x,
 * reading from most significant to least significant bit.
 *
 * Example (when considering the first 10 bits of x):
 *
 * rho(x = 0b1000000000) returns 1
 * rho(x = 0b0010000000) returns 3
 * rho(x = 0b0000000000) returns b + 1
 *
 * "The binary address determined by the first b bits of x"
 *
 * The return value "j" is used to index the bit pattern to watch.
 */
static inline uint8
rho(uint32 x, uint8 b)
{
	uint8		j = 1;

	if (x == 0)
		return b + 1;

	j = 32 - pg_leftmost_one_pos32(x);

	if (j > b)
		return b + 1;

	return j;
}

/*
 * Initialize HyperLogLog track state
 */
void
initSHLL(HyperLogLogState *cState)
{
	memset(cState->regs, 0, sizeof(cState->regs));
}

/*
 * Adds element to the estimator, from caller-supplied hash.
 *
 * It is critical that the hash value passed be an actual hash value, typically
 * generated using hash_any().  The algorithm relies on a specific bit-pattern
 * observable in conjunction with stochastic averaging.  There must be a
 * uniform distribution of bits in hash values for each distinct original value
 * observed.
 */
void
addSHLL(HyperLogLogState *cState, uint32 hash)
{
	uint8		count;
	uint32		index;
	size_t		i;
	size_t		j;

	TimestampTz now = GetCurrentTimestamp();

	/* Use the first "k" (registerWidth) bits as a zero based index */
	index = hash >> HLL_C_BITS;

	/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
	count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);

	cState->regs[index][count] = now;
}

static uint8
getMaximum(const TimestampTz *reg, TimestampTz since)
{
	uint8		max = 0;

	for (size_t i = 0; i < HLL_C_BITS + 1; i++)
	{
		if (reg[i] >= since)
		{
			max = i;
		}
	}

	return max;
}


/*
 * Estimates cardinality, based on elements added so far
 */
double
estimateSHLL(HyperLogLogState *cState, time_t duration)
{
	double		result;
	double		sum = 0.0;
	size_t		i;
	uint8		R[HLL_N_REGISTERS];
	/* 0 indicates an uninitialized timestamp, so to cover the whole range start with 1 */
	TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC;

	for (i = 0; i < HLL_N_REGISTERS; i++)
	{
		R[i] = getMaximum(cState->regs[i], since);
		sum += 1.0 / pow(2.0, R[i]);
	}

	/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
	result = ALPHA_MM / sum;

	if (result <= (5.0 / 2.0) * HLL_N_REGISTERS)
	{
		/* Small range correction */
		int			zero_count = 0;

		for (i = 0; i < HLL_N_REGISTERS; i++)
		{
			zero_count += R[i] == 0;
		}

		if (zero_count != 0)
			result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS /
										   zero_count);
	}
	else if (result > (1.0 / 30.0) * POW_2_32)
	{
		/* Large range correction */
		result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
	}

	return result;
}
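To make the intended call pattern concrete, here is a hedged usage sketch of the three functions above inside a PostgreSQL backend. It is not code from this diff: the wrapper names are invented, and hash_any() (from common/hashfn.h in recent PostgreSQL versions) stands in for whatever key hashing the caller actually uses.

/* Illustrative usage of the sliding-HLL API; assumes a PostgreSQL backend
 * environment.  Wrapper names are hypothetical. */
#include "postgres.h"
#include "common/hashfn.h"		/* hash_any() */
#include "utils/timestamp.h"
#include "hll.h"

static HyperLogLogState sketch;

static void
sketch_init(void)
{
	initSHLL(&sketch);
}

/* Record one observation of an arbitrary fixed-size key. */
static void
sketch_observe(const void *key, Size keylen)
{
	uint32		h = DatumGetUInt32(hash_any((const unsigned char *) key, keylen));

	addSHLL(&sketch, h);
}

/* Approximate number of distinct keys seen in the last hour. */
static double
sketch_distinct_last_hour(void)
{
	return estimateSHLL(&sketch, (time_t) 3600);
}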
pgxn/neon/hll.h (new file, 86 lines)
@@ -0,0 +1,86 @@
/*-------------------------------------------------------------------------
 *
 * hll.h
 *	  Sliding HyperLogLog cardinality estimator
 *
 * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
 *
 * Implements https://hal.science/hal-00465313/document
 *
 * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
 * suited to estimating the cardinality of very large sets; in particular, we
 * have not attempted to further optimize the implementation as described in
 * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
 * Engineering of a State of The Art Cardinality Estimation Algorithm".
 *
 * A sparse representation of HyperLogLog state is used, with fixed space
 * overhead.
 *
 * The copyright terms of Ohno's original version (the MIT license) follow.
 *
 * IDENTIFICATION
 *	  src/backend/lib/hyperloglog.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the 'Software'), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef HLL_H
#define HLL_H

#define HLL_BIT_WIDTH		10
#define HLL_C_BITS			(32 - HLL_BIT_WIDTH)
#define HLL_N_REGISTERS		(1 << HLL_BIT_WIDTH)

/*
 * HyperLogLog is an approximate technique for computing the number of distinct
 * entries in a set.  Importantly, it does this by using a fixed amount of
 * memory.  See the 2007 paper "HyperLogLog: the analysis of a near-optimal
 * cardinality estimation algorithm" for more.
 *
 * Instead of a single counter for every bits register, we have a timestamp
 * for every valid number of bits we can encounter.  Every time we encounter
 * a certain number of bits, we update the timestamp in those registers to
 * the current timestamp.
 *
 * We can query the sketch's stored cardinality for the range of some timestamp
 * up to now: for each register, we return the highest bits bucket that has a
 * modified timestamp >= the query timestamp.  This value is the number of bits
 * for this register in the normal HLL calculation.
 *
 * The memory usage is 2^B * (C + 1) * sizeof(TimestampTz), or 184 kiB.
 * Usage could be halved if we decide to reduce the required time dimension
 * precision; 32 bits at second precision should be enough for statistics.
 * However, that is not yet implemented.
 */
typedef struct HyperLogLogState
{
	TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
} HyperLogLogState;

extern void initSHLL(HyperLogLogState *cState);
extern void addSHLL(HyperLogLogState *cState, uint32 hash);
extern double estimateSHLL(HyperLogLogState *cState, time_t duration);

#endif
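The 184 kiB figure in the comment above follows directly from the constants in this header: 2^10 registers, each holding HLL_C_BITS + 1 = 23 timestamps of 8 bytes. A standalone check of that arithmetic (illustrative only; TimestampTz is re-declared here so the snippet compiles outside the backend):

/* Worked size check for the register array above -- not part of the diff. */
#include <stdint.h>

typedef int64_t TimestampTz;	/* 8 bytes, as in the backend */

#define HLL_BIT_WIDTH	10
#define HLL_C_BITS		(32 - HLL_BIT_WIDTH)	/* 22 */
#define HLL_N_REGISTERS	(1 << HLL_BIT_WIDTH)	/* 1024 */

/* 1024 registers * 23 slots * 8 bytes = 188,416 bytes = 184 kiB */
_Static_assert(sizeof(TimestampTz[HLL_N_REGISTERS][HLL_C_BITS + 1]) == 184 * 1024,
			   "sliding-HLL register array is 184 kiB");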
@@ -427,12 +427,17 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		values[n_pgsql_params] = NULL;

 		shard->conn = PQconnectStartParams(keywords, values, 1);
-		if (!shard->conn)
+		if (PQstatus(shard->conn) == CONNECTION_BAD)
 		{
-			neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
+			char	   *msg = pchomp(PQerrorMessage(shard->conn));
+			CLEANUP_AND_DISCONNECT(shard);
+			ereport(elevel,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+					 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+					 errdetail_internal("%s", msg)));
+			pfree(msg);
 			return false;
 		}

 		shard->state = PS_Connecting_Startup;
 		/* fallthrough */
 	}
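The change above replaces a NULL check with a PQstatus() check: PQconnectStartParams() returns NULL only when libpq cannot allocate a PGconn at all, while other early failures leave the connection object in the CONNECTION_BAD state, so both cases must be handled and the error text reported. A standalone sketch of that libpq pattern outside Neon (connection parameters here are placeholders):

/* Illustrative libpq error handling -- not Neon code. */
#include <stdio.h>
#include <libpq-fe.h>

int
main(void)
{
	const char *keywords[] = {"host", "port", NULL};
	const char *values[]   = {"localhost", "5432", NULL};
	PGconn	   *conn = PQconnectStartParams(keywords, values, 0);

	if (conn == NULL || PQstatus(conn) == CONNECTION_BAD)
	{
		fprintf(stderr, "could not start connection: %s",
				conn ? PQerrorMessage(conn) : "out of memory\n");
		if (conn)
			PQfinish(conn);
		return 1;
	}

	/* ... drive the connection to completion with PQconnectPoll() ... */
	PQfinish(conn);
	return 0;
}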
Some files were not shown because too many files have changed in this diff.