mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-15 12:10:37 +00:00
Compare commits
293 Commits
yuchen/lsn
...
release-pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
53d53d5b1e | ||
|
|
29fe6ea47a | ||
|
|
640327ccb3 | ||
|
|
7cf0f6b37e | ||
|
|
03c2c569be | ||
|
|
eff6d4538a | ||
|
|
5ef7782e9c | ||
|
|
73101db8c4 | ||
|
|
bccdfc6d39 | ||
|
|
99595813bb | ||
|
|
fe07b54758 | ||
|
|
a42d173e7b | ||
|
|
e07f689238 | ||
|
|
7831eddc88 | ||
|
|
943b1bc80c | ||
|
|
95a184e9b7 | ||
|
|
3fa17e9d17 | ||
|
|
55e0fd9789 | ||
|
|
2a88889f44 | ||
|
|
5bad8126dc | ||
|
|
27bc242085 | ||
|
|
192b49cc6d | ||
|
|
e1b60f3693 | ||
|
|
2804f5323b | ||
|
|
676adc6b32 | ||
|
|
96a4e8de66 | ||
|
|
01180666b0 | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
@@ -8,7 +8,6 @@
|
||||
!scripts/combine_control_files.py
|
||||
!scripts/ninstall.sh
|
||||
!vm-cgconfig.conf
|
||||
!docker-compose/run-tests.sh
|
||||
|
||||
# Directories
|
||||
!.cargo/
|
||||
@@ -21,7 +20,7 @@
|
||||
!patches/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!storage_scrubber/
|
||||
!s3_scrubber/
|
||||
!safekeeper/
|
||||
!storage_broker/
|
||||
!storage_controller/
|
||||
|
||||
53
.github/workflows/approved-for-ci-run.yml
vendored
53
.github/workflows/approved-for-ci-run.yml
vendored
@@ -69,41 +69,15 @@ jobs:
|
||||
with:
|
||||
ref: main
|
||||
token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
- name: Look for existing PR
|
||||
id: get-pr
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
|
||||
echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
|
||||
|
||||
- name: Get changed labels
|
||||
id: get-labels
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED != ''
|
||||
env:
|
||||
ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \
|
||||
<(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
|
||||
( grep -v run-e2e-tests-in-draft || true ) | paste -sd , -)
|
||||
LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \
|
||||
<(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
|
||||
paste -sd , -)
|
||||
echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
|
||||
echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}
|
||||
|
||||
- run: gh pr checkout "${PR_NUMBER}"
|
||||
|
||||
- run: git checkout -b "${BRANCH}"
|
||||
|
||||
- run: git push --force origin "${BRANCH}"
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED == ''
|
||||
|
||||
- name: Create a Pull Request for CI run (if required)
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED == ''
|
||||
env:
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
@@ -114,33 +88,16 @@ jobs:
|
||||
Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
|
||||
EOF
|
||||
|
||||
LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER} --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft )| \
|
||||
grep -E '^run' | paste -sd , -)
|
||||
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
|
||||
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
|
||||
if [ -z "${ALREADY_CREATED}" ]; then
|
||||
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
|
||||
--body-file "body.md" \
|
||||
--head "${BRANCH}" \
|
||||
--base "main" \
|
||||
--label ${LABELS} \
|
||||
--label "run-e2e-tests-in-draft" \
|
||||
--draft
|
||||
- name: Modify the existing pull request (if required)
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED != ''
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }}
|
||||
LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }}
|
||||
ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
|
||||
run: |
|
||||
ADD_CMD=
|
||||
REMOVE_CMD=
|
||||
[ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}"
|
||||
[ -z "${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}"
|
||||
if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then
|
||||
gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD}
|
||||
fi
|
||||
|
||||
- run: git push --force origin "${BRANCH}"
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED != ''
|
||||
|
||||
cleanup:
|
||||
# Close PRs and delete branchs if the original PR is closed.
|
||||
|
||||
|
||||
8
.github/workflows/benchmarking.yml
vendored
8
.github/workflows/benchmarking.yml
vendored
@@ -99,7 +99,7 @@ jobs:
|
||||
# Set --sparse-ordering option of pytest-order plugin
|
||||
# to ensure tests are running in order of appears in the file.
|
||||
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
|
||||
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
|
||||
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -410,14 +410,14 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Benchmark pgvector queries
|
||||
- name: Benchmark pgvector hnsw queries
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_perf_pgvector_queries.py
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
|
||||
@@ -30,6 +30,7 @@ jobs:
|
||||
check-image:
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
# This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
|
||||
build-image:
|
||||
needs: [ check-image ]
|
||||
if: needs.check-image.outputs.found == 'false'
|
||||
@@ -54,7 +55,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
|
||||
89
.github/workflows/build_and_test.yml
vendored
89
.github/workflows/build_and_test.yml
vendored
@@ -337,8 +337,34 @@ jobs:
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
# binaries with different features/flags.
|
||||
- name: Run rust tests
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
#nextest does not yet support running doctests
|
||||
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
# Install target binaries
|
||||
@@ -379,32 +405,6 @@ jobs:
|
||||
done
|
||||
fi
|
||||
|
||||
- name: Run rust tests
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
#nextest does not yet support running doctests
|
||||
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
|
||||
- name: Install postgres binaries
|
||||
run: cp -a pg_install /tmp/neon/pg_install
|
||||
|
||||
@@ -859,26 +859,6 @@ jobs:
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Build neon extensions test image
|
||||
if: matrix.version == 'v16'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
target: neon-pg-ext-test
|
||||
cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
if: matrix.version == 'v16'
|
||||
@@ -922,13 +902,6 @@ jobs:
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- name: Create multi-arch neon-test-extensions image
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- name: Create multi-arch compute-tools image
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
@@ -965,7 +938,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v1
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -1047,7 +1020,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify docker-compose example and test extensions
|
||||
- name: Verify docker-compose example
|
||||
timeout-minutes: 20
|
||||
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
|
||||
|
||||
@@ -1101,8 +1074,6 @@ jobs:
|
||||
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
done
|
||||
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
|
||||
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
needs: [ check-permissions, tag ]
|
||||
|
||||
23
.github/workflows/check-build-tools-image.yml
vendored
23
.github/workflows/check-build-tools-image.yml
vendored
@@ -25,17 +25,26 @@ jobs:
|
||||
found: ${{ steps.check-image.outputs.found }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Get build-tools image tag for the current commit
|
||||
id: get-build-tools-tag
|
||||
env:
|
||||
IMAGE_TAG: |
|
||||
${{ hashFiles('Dockerfile.build-tools',
|
||||
'.github/workflows/check-build-tools-image.yml',
|
||||
'.github/workflows/build-build-tools-image.yml') }}
|
||||
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
|
||||
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT
|
||||
LAST_BUILD_TOOLS_SHA=$(
|
||||
gh api \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
--method GET \
|
||||
--field path=Dockerfile.build-tools \
|
||||
--field sha=${COMMIT_SHA} \
|
||||
--field per_page=1 \
|
||||
--jq ".[0].sha" \
|
||||
"/repos/${GITHUB_REPOSITORY}/commits"
|
||||
)
|
||||
echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- name: Check if such tag found in the registry
|
||||
id: check-image
|
||||
|
||||
97
Cargo.lock
generated
97
Cargo.lock
generated
@@ -5109,6 +5109,54 @@ version = "1.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
|
||||
|
||||
[[package]]
|
||||
name = "s3_scrubber"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"aws-config",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-async",
|
||||
"bincode",
|
||||
"bytes",
|
||||
"camino",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"hex",
|
||||
"histogram",
|
||||
"humantime",
|
||||
"itertools",
|
||||
"once_cell",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.22.4",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "safekeeper"
|
||||
version = "0.1.0"
|
||||
@@ -5765,54 +5813,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage_scrubber"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"aws-config",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-async",
|
||||
"bincode",
|
||||
"bytes",
|
||||
"camino",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"hex",
|
||||
"histogram",
|
||||
"humantime",
|
||||
"itertools",
|
||||
"once_cell",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.22.4",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storcon_cli"
|
||||
version = "0.1.0"
|
||||
@@ -5820,7 +5820,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"futures",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"pageserver_api",
|
||||
|
||||
@@ -13,7 +13,7 @@ members = [
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
"storage_controller",
|
||||
"storage_scrubber",
|
||||
"s3_scrubber",
|
||||
"workspace_hack",
|
||||
"trace",
|
||||
"libs/compute_api",
|
||||
@@ -120,7 +120,7 @@ num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.20.0"
|
||||
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.12.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||
@@ -128,7 +128,7 @@ parquet_derive = "51.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
procfs = "0.14"
|
||||
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
@@ -184,7 +184,7 @@ tower-service = "0.3.2"
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.21.0"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
url = "2.2"
|
||||
urlencoding = "2.1"
|
||||
|
||||
@@ -141,7 +141,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.79.0
|
||||
ENV RUSTC_VERSION=1.78.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
|
||||
@@ -246,8 +246,8 @@ COPY patches/pgvector.patch /pgvector.patch
|
||||
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||
# because we build the images on different machines than where we run them.
|
||||
# Pass OPTFLAGS="" to remove it.
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
|
||||
echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
@@ -928,69 +928,6 @@ RUN rm -r /usr/local/pgsql/include
|
||||
# if they were to be used by other libraries.
|
||||
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer neon-pg-ext-test
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM neon-pg-ext-build AS neon-pg-ext-test
|
||||
ARG PG_VERSION
|
||||
RUN mkdir /ext-src
|
||||
|
||||
#COPY --from=postgis-build /postgis.tar.gz /ext-src/
|
||||
#COPY --from=postgis-build /sfcgal/* /usr
|
||||
COPY --from=plv8-build /plv8.tar.gz /ext-src/
|
||||
COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
|
||||
COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.patch /ext-src/
|
||||
COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
|
||||
#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
|
||||
#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
|
||||
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
|
||||
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
|
||||
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
|
||||
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
|
||||
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
|
||||
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
|
||||
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
|
||||
COPY --from=hll-pg-build /hll.tar.gz /ext-src
|
||||
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||
COPY patches/pg_hintplan.patch /ext-src
|
||||
#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
|
||||
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
|
||||
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
|
||||
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
|
||||
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
|
||||
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
|
||||
COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
|
||||
COPY patches/pg_anon.patch /ext-src
|
||||
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
|
||||
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
|
||||
RUN cd /ext-src/ && for f in *.tar.gz; \
|
||||
do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
|
||||
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|
||||
|| exit 1; rm -f $f; done
|
||||
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
||||
# cmake is required for the h3 test
|
||||
RUN apt-get update && apt-get install -y cmake
|
||||
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN patch -p1 </ext-src/pg_anon.patch
|
||||
RUN patch -p1 </ext-src/pg_cron.patch
|
||||
ENV PATH=/usr/local/pgsql/bin:$PATH
|
||||
ENV PGHOST=compute
|
||||
ENV PGPORT=55433
|
||||
ENV PGUSER=cloud_admin
|
||||
ENV PGDATABASE=postgres
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
|
||||
2
Makefile
2
Makefile
@@ -124,8 +124,6 @@ postgres-%: postgres-configure-% \
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
|
||||
+@echo "Compiling amcheck $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
|
||||
+@echo "Compiling test_decoding $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install
|
||||
|
||||
.PHONY: postgres-clean-%
|
||||
postgres-clean-%:
|
||||
|
||||
@@ -735,7 +735,7 @@ fn cli() -> clap::Command {
|
||||
Arg::new("filecache-connstr")
|
||||
.long("filecache-connstr")
|
||||
.default_value(
|
||||
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
|
||||
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
ALTER ROLE neon_superuser BYPASSRLS;
|
||||
@@ -1,18 +0,0 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
|
||||
END LOOP;
|
||||
|
||||
FOR role_name IN SELECT rolname FROM pg_roles
|
||||
WHERE
|
||||
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
|
||||
END LOOP;
|
||||
END $$;
|
||||
@@ -1,6 +0,0 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -1 +0,0 @@
|
||||
GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;
|
||||
@@ -1,4 +0,0 @@
|
||||
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
|
||||
-- interacted with by neon_superuser without permission issues.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser;
|
||||
@@ -1,4 +0,0 @@
|
||||
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
|
||||
-- interacted with by neon_superuser without permission issues.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser;
|
||||
@@ -1,3 +0,0 @@
|
||||
-- SKIP: Moved inline to the handle_grants() functions.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
|
||||
@@ -1,3 +0,0 @@
|
||||
-- SKIP: Moved inline to the handle_grants() functions.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
|
||||
@@ -1,13 +0,0 @@
|
||||
-- SKIP: The original goal of this migration was to prevent creating
|
||||
-- subscriptions, but this migration was insufficient.
|
||||
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name TEXT;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||
END LOOP;
|
||||
END $$;
|
||||
@@ -774,21 +774,44 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
// !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
|
||||
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
// Add new migrations in numerical order.
|
||||
let migrations = [
|
||||
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
|
||||
include_str!("./migrations/0001-alter_roles.sql"),
|
||||
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
|
||||
include_str!(
|
||||
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!(
|
||||
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
|
||||
"ALTER ROLE neon_superuser BYPASSRLS",
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
|
||||
END LOOP;
|
||||
|
||||
FOR role_name IN SELECT rolname FROM pg_roles
|
||||
WHERE
|
||||
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
|
||||
END LOOP;
|
||||
END $$;
|
||||
"#,
|
||||
r#"
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
|
||||
// Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
];
|
||||
|
||||
let mut func = || {
|
||||
@@ -824,13 +847,10 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
let migration = &migrations[current_migration];
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", current_migration);
|
||||
if migration.is_empty() {
|
||||
info!("Skip migration id={}", current_migration);
|
||||
} else {
|
||||
info!(
|
||||
"Running migration id={}:\n{}\n",
|
||||
current_migration, migration
|
||||
);
|
||||
info!("Running migration:\n{}\n", migration);
|
||||
client.simple_query(migration).with_context(|| {
|
||||
format!("handle_migrations current_migration={}", current_migration)
|
||||
})?;
|
||||
|
||||
@@ -9,7 +9,6 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use futures::StreamExt;
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
@@ -149,22 +148,6 @@ enum Command {
|
||||
#[arg(long)]
|
||||
threshold: humantime::Duration,
|
||||
},
|
||||
// Drain a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// outside of the specified set.
|
||||
Drain {
|
||||
// Set of pageserver node ids to drain.
|
||||
#[arg(long)]
|
||||
nodes: Vec<NodeId>,
|
||||
// Optional: migration concurrency (default is 8)
|
||||
#[arg(long)]
|
||||
concurrency: Option<usize>,
|
||||
// Optional: maximum number of shards to migrate
|
||||
#[arg(long)]
|
||||
max_shards: Option<usize>,
|
||||
// Optional: when set to true, nothing is migrated, but the plan is printed to stdout
|
||||
#[arg(long)]
|
||||
dry_run: Option<bool>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -754,194 +737,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::Drain {
|
||||
nodes,
|
||||
concurrency,
|
||||
max_shards,
|
||||
dry_run,
|
||||
} => {
|
||||
// Load the list of nodes, split them up into the drained and filled sets,
|
||||
// and validate that draining is possible.
|
||||
let node_descs = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut node_to_drain_descs = Vec::new();
|
||||
let mut node_to_fill_descs = Vec::new();
|
||||
|
||||
for desc in node_descs {
|
||||
let to_drain = nodes.iter().any(|id| *id == desc.id);
|
||||
if to_drain {
|
||||
node_to_drain_descs.push(desc);
|
||||
} else {
|
||||
node_to_fill_descs.push(desc);
|
||||
}
|
||||
}
|
||||
|
||||
if nodes.len() != node_to_drain_descs.len() {
|
||||
anyhow::bail!("Drain requested for node which doesn't exist.")
|
||||
}
|
||||
|
||||
node_to_fill_descs.retain(|desc| {
|
||||
matches!(desc.availability, NodeAvailabilityWrapper::Active)
|
||||
&& matches!(
|
||||
desc.scheduling,
|
||||
NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling
|
||||
)
|
||||
});
|
||||
|
||||
if node_to_fill_descs.is_empty() {
|
||||
anyhow::bail!("There are no nodes to drain to")
|
||||
}
|
||||
|
||||
// Set the node scheduling policy to draining for the nodes which
|
||||
// we plan to drain.
|
||||
for node_desc in node_to_drain_descs.iter() {
|
||||
let req = NodeConfigureRequest {
|
||||
node_id: node_desc.id,
|
||||
availability: None,
|
||||
scheduling: Some(NodeSchedulingPolicy::Draining),
|
||||
};
|
||||
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{}/config", node_desc.id),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Perform the drain: move each tenant shard scheduled on a node to
|
||||
// be drained to a node which is being filled. A simple round robin
|
||||
// strategy is used to pick the new node.
|
||||
let tenants = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/tenant".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut selected_node_idx = 0;
|
||||
|
||||
struct DrainMove {
|
||||
tenant_shard_id: TenantShardId,
|
||||
from: NodeId,
|
||||
to: NodeId,
|
||||
}
|
||||
|
||||
let mut moves: Vec<DrainMove> = Vec::new();
|
||||
|
||||
let shards = tenants
|
||||
.into_iter()
|
||||
.flat_map(|tenant| tenant.shards.into_iter());
|
||||
for shard in shards {
|
||||
if let Some(max_shards) = max_shards {
|
||||
if moves.len() >= max_shards {
|
||||
println!(
|
||||
"Stop planning shard moves since the requested maximum was reached"
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let should_migrate = {
|
||||
if let Some(attached_to) = shard.node_attached {
|
||||
node_to_drain_descs
|
||||
.iter()
|
||||
.map(|desc| desc.id)
|
||||
.any(|id| id == attached_to)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
};
|
||||
|
||||
if !should_migrate {
|
||||
continue;
|
||||
}
|
||||
|
||||
moves.push(DrainMove {
|
||||
tenant_shard_id: shard.tenant_shard_id,
|
||||
from: shard
|
||||
.node_attached
|
||||
.expect("We only migrate attached tenant shards"),
|
||||
to: node_to_fill_descs[selected_node_idx].id,
|
||||
});
|
||||
selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len();
|
||||
}
|
||||
|
||||
let total_moves = moves.len();
|
||||
|
||||
if dry_run == Some(true) {
|
||||
println!("Dryrun requested. Planned {total_moves} moves:");
|
||||
for mv in &moves {
|
||||
println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to)
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
|
||||
let mut stream = futures::stream::iter(moves)
|
||||
.map(|mv| {
|
||||
let client = Client::new(cli.api.clone(), cli.jwt.clone());
|
||||
async move {
|
||||
client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id: mv.tenant_shard_id,
|
||||
node_id: mv.to,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
|
||||
}
|
||||
})
|
||||
.buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY));
|
||||
|
||||
let mut success = 0;
|
||||
let mut failure = 0;
|
||||
|
||||
while let Some(res) = stream.next().await {
|
||||
match res {
|
||||
Ok(_) => {
|
||||
success += 1;
|
||||
}
|
||||
Err((tenant_shard_id, from, to, error)) => {
|
||||
failure += 1;
|
||||
println!(
|
||||
"Failed to migrate {} from node {} to node {}: {}",
|
||||
tenant_shard_id, from, to, error
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (success + failure) % 20 == 0 {
|
||||
println!(
|
||||
"Processed {}/{} shards: {} succeeded, {} failed",
|
||||
success + failure,
|
||||
total_moves,
|
||||
success,
|
||||
failure
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
println!(
|
||||
"Processed {}/{} shards: {} succeeded, {} failed",
|
||||
success + failure,
|
||||
total_moves,
|
||||
success,
|
||||
failure
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -8,11 +8,6 @@ USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
python3-pip \
|
||||
netcat
|
||||
#Faker is required for the pg_anon test
|
||||
RUN pip3 install Faker
|
||||
#This is required for the pg_hintplan test
|
||||
RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src
|
||||
|
||||
USER postgres
|
||||
USER postgres
|
||||
|
||||
@@ -95,7 +95,7 @@
|
||||
},
|
||||
{
|
||||
"name": "shared_preload_libraries",
|
||||
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
|
||||
"value": "neon",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
@@ -127,16 +127,6 @@
|
||||
"name": "max_replication_flush_lag",
|
||||
"value": "10GB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "cron.database",
|
||||
"value": "postgres",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "session_preload_libraries",
|
||||
"value": "anon",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
minio:
|
||||
restart: always
|
||||
@@ -159,12 +161,12 @@ services:
|
||||
context: ./compute_wrapper/
|
||||
args:
|
||||
- REPOSITORY=${REPOSITORY:-neondatabase}
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
|
||||
- TAG=${TAG:-latest}
|
||||
- http_proxy=$http_proxy
|
||||
- https_proxy=$https_proxy
|
||||
environment:
|
||||
- PG_VERSION=${PG_VERSION:-16}
|
||||
- PG_VERSION=${PG_VERSION:-14}
|
||||
#- RUST_BACKTRACE=1
|
||||
# Mount the test files directly, for faster editing cycle.
|
||||
volumes:
|
||||
@@ -192,14 +194,3 @@ services:
|
||||
done"
|
||||
depends_on:
|
||||
- compute
|
||||
|
||||
neon-test-extensions:
|
||||
profiles: ["test-extensions"]
|
||||
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
|
||||
entrypoint:
|
||||
- "/bin/bash"
|
||||
- "-c"
|
||||
command:
|
||||
- sleep 1800
|
||||
depends_on:
|
||||
- compute
|
||||
|
||||
@@ -7,94 +7,52 @@
|
||||
# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
|
||||
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
|
||||
# to verify custom image builds (e.g pre-published ones).
|
||||
#
|
||||
# A test script for postgres extensions
|
||||
# Currently supports only v16
|
||||
#
|
||||
|
||||
set -eux -o pipefail
|
||||
|
||||
COMPOSE_FILE='docker-compose.yml'
|
||||
cd $(dirname $0)
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
|
||||
|
||||
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
|
||||
TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
|
||||
: ${http_proxy:=}
|
||||
: ${https_proxy:=}
|
||||
export http_proxy https_proxy
|
||||
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
|
||||
|
||||
cleanup() {
|
||||
echo "show container information"
|
||||
docker ps
|
||||
docker compose --profile test-extensions -f $COMPOSE_FILE logs
|
||||
docker compose -f $COMPOSE_FILE logs
|
||||
echo "stop containers..."
|
||||
docker compose --profile test-extensions -f $COMPOSE_FILE down
|
||||
docker compose -f $COMPOSE_FILE down
|
||||
}
|
||||
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
|
||||
for pg_version in 14 15 16; do
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
|
||||
PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
|
||||
echo "start containers (pg_version=$pg_version)."
|
||||
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
|
||||
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
cnt=0
|
||||
while sleep 3; do
|
||||
while sleep 1; do
|
||||
# check timeout
|
||||
cnt=`expr $cnt + 3`
|
||||
cnt=`expr $cnt + 1`
|
||||
if [ $cnt -gt 60 ]; then
|
||||
echo "timeout before the compute is ready."
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
|
||||
|
||||
# check if the compute is ready
|
||||
set +o pipefail
|
||||
result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
|
||||
set -o pipefail
|
||||
if [ $result -eq 1 ]; then
|
||||
echo "OK. The compute is ready to connect."
|
||||
echo "execute simple queries."
|
||||
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
|
||||
cleanup
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ $pg_version -ge 16 ]
|
||||
then
|
||||
echo Enabling trust connection
|
||||
docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
|
||||
echo Adding postgres role
|
||||
docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
|
||||
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config
|
||||
docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# This block is required for the pg_anon extension test.
|
||||
# The test assumes that it is running on the same host with the postgres engine.
|
||||
# In our case it's not true, that's why we are copying files to the compute node
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
|
||||
echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
|
||||
rm -rf $TMPDIR
|
||||
TMPDIR=$(mktemp -d)
|
||||
# The following block does the same for the pg_hintplan test
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
|
||||
rm -rf $TMPDIR
|
||||
# We are running tests now
|
||||
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
|
||||
then
|
||||
cleanup
|
||||
else
|
||||
FAILED=$(tail -1 testout.txt)
|
||||
for d in $FAILED
|
||||
do
|
||||
mkdir $d
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
|
||||
cat $d/regression.out $d/regression.diffs || true
|
||||
done
|
||||
rm -rf $FAILED
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
cleanup
|
||||
done
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -x
|
||||
|
||||
cd /ext-src
|
||||
FAILED=
|
||||
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d ${d} ] || continue
|
||||
psql -c "select 1" >/dev/null || break
|
||||
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
|
||||
done
|
||||
[ -z "${FAILED}" ] && exit 0
|
||||
echo ${FAILED}
|
||||
exit 1
|
||||
@@ -4,18 +4,18 @@
|
||||
|
||||
Currently we build two main images:
|
||||
|
||||
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14.
|
||||
- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).
|
||||
|
||||
And additional intermediate image:
|
||||
|
||||
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
|
||||
|
||||
## Build pipeline
|
||||
## Building pipeline
|
||||
|
||||
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
|
||||
|
||||
1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
|
||||
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
|
||||
|
||||
2. `neondatabase/neon`
|
||||
|
||||
@@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea
|
||||
1. create containers
|
||||
|
||||
You can specify version of neon cluster using following environment values.
|
||||
- PG_VERSION: postgres version for compute (default is 16 as of this writing)
|
||||
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest'
|
||||
- PG_VERSION: postgres version for compute (default is 14)
|
||||
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
|
||||
```
|
||||
$ cd docker-compose/
|
||||
$ docker-compose down # remove the containers if exists
|
||||
$ PG_VERSION=16 TAG=latest docker-compose up --build -d # You can specify the postgres and image version
|
||||
$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version
|
||||
Creating network "dockercompose_default" with the default driver
|
||||
Creating docker-compose_storage_broker_1 ... done
|
||||
(...omit...)
|
||||
@@ -47,31 +47,29 @@ Creating docker-compose_storage_broker_1 ... done
|
||||
|
||||
2. connect compute node
|
||||
```
|
||||
$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres
|
||||
psql (16.3)
|
||||
Type "help" for help.
|
||||
|
||||
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
|
||||
$ chmod 600 ~/.pgpass
|
||||
$ psql -h localhost -p 55433 -U cloud_admin
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
CREATE TABLE
|
||||
postgres=# insert into t values(1, 1);
|
||||
postgres=# insert into t values(1,1);
|
||||
INSERT 0 1
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
|
||||
```
|
||||
|
||||
3. If you want to see the log, you can use `docker-compose logs` command.
|
||||
```
|
||||
# check the container name you want to see
|
||||
$ docker ps
|
||||
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
|
||||
3582f6d76227 docker-compose_compute "/shell/compute.sh" 2 minutes ago Up 2 minutes 0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp docker-compose_compute_1
|
||||
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
|
||||
d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1
|
||||
(...omit...)
|
||||
|
||||
$ docker logs -f docker-compose_compute_1
|
||||
$ docker logs -f dockercompose_compute_1
|
||||
2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql
|
||||
2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
|
||||
(...omit...)
|
||||
|
||||
@@ -558,12 +558,6 @@ impl KeySpaceRandomAccum {
|
||||
self.ranges.push(range);
|
||||
}
|
||||
|
||||
pub fn add_keyspace(&mut self, keyspace: KeySpace) {
|
||||
for range in keyspace.ranges {
|
||||
self.add_range(range);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_keyspace(mut self) -> KeySpace {
|
||||
let mut ranges = Vec::new();
|
||||
if !self.ranges.is_empty() {
|
||||
|
||||
@@ -177,15 +177,6 @@ serde_with::serde_conv!(
|
||||
|value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
|
||||
);
|
||||
|
||||
impl LsnLease {
|
||||
pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
|
||||
|
||||
/// Checks whether the lease is expired.
|
||||
pub fn is_expired(&self, now: &SystemTime) -> bool {
|
||||
now > &self.valid_until
|
||||
}
|
||||
}
|
||||
|
||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum ActivatingFrom {
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fmt::Display;
|
||||
use std::io;
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
@@ -30,7 +29,6 @@ use http_types::{StatusCode, Url};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use utils::backoff;
|
||||
|
||||
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
|
||||
use crate::{
|
||||
@@ -453,58 +451,26 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
// TODO batch requests are not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
for path in paths {
|
||||
#[derive(Debug)]
|
||||
enum AzureOrTimeout {
|
||||
AzureError(azure_core::Error),
|
||||
Timeout,
|
||||
Cancel,
|
||||
}
|
||||
impl Display for AzureOrTimeout {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{self:?}")
|
||||
}
|
||||
}
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 5;
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
|
||||
let request = blob_client.delete().into_future();
|
||||
let request = blob_client.delete().into_future();
|
||||
|
||||
let res = tokio::time::timeout(self.timeout, request).await;
|
||||
let res = tokio::time::timeout(self.timeout, request).await;
|
||||
|
||||
match res {
|
||||
Ok(Ok(_v)) => Ok(()),
|
||||
Ok(Err(azure_err)) => {
|
||||
if let Some(http_err) = azure_err.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
Err(AzureOrTimeout::AzureError(azure_err))
|
||||
match res {
|
||||
Ok(Ok(_response)) => continue,
|
||||
Ok(Err(e)) => {
|
||||
if let Some(http_err) = e.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
continue;
|
||||
}
|
||||
Err(_elapsed) => Err(AzureOrTimeout::Timeout),
|
||||
}
|
||||
},
|
||||
|err| match err {
|
||||
AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false,
|
||||
AzureOrTimeout::Cancel => true,
|
||||
},
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"deleting remote object",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| AzureOrTimeout::Cancel)
|
||||
.and_then(|x| x)
|
||||
.map_err(|e| match e {
|
||||
AzureOrTimeout::AzureError(err) => anyhow::Error::from(err),
|
||||
AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(),
|
||||
AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(),
|
||||
})?;
|
||||
return Err(e.into());
|
||||
}
|
||||
Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
};
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ license.workspace = true
|
||||
[dependencies]
|
||||
hyper.workspace = true
|
||||
opentelemetry = { workspace = true, features=["rt-tokio"] }
|
||||
opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions.workspace = true
|
||||
reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
|
||||
@@ -34,9 +34,6 @@ pub enum ApiError {
|
||||
#[error("Timeout")]
|
||||
Timeout(Cow<'static, str>),
|
||||
|
||||
#[error("Request cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -77,10 +74,6 @@ impl ApiError {
|
||||
err.to_string(),
|
||||
StatusCode::REQUEST_TIMEOUT,
|
||||
),
|
||||
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
|
||||
self.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
@@ -140,7 +133,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
|
||||
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
|
||||
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
|
||||
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
|
||||
ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
|
||||
_ => info!("Error processing HTTP request: {api_error:#}"),
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
pub(crate) enum IndexPartCmd {
|
||||
@@ -12,7 +17,20 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
IndexPartCmd::Dump { path } => {
|
||||
let bytes = tokio::fs::read(path).await.context("read file")?;
|
||||
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
|
||||
let output = serde_json::to_string_pretty(&des).context("serialize output")?;
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output<'a> {
|
||||
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
timeline_metadata: &'a TimelineMetadata,
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
layer_metadata: &des.layer_metadata,
|
||||
disk_consistent_lsn: des.metadata.disk_consistent_lsn(),
|
||||
timeline_metadata: &des.metadata,
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).context("serialize output")?;
|
||||
println!("{output}");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
//! and push them to a HTTP endpoint.
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::size::CalculateSyntheticSizeError;
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
|
||||
use crate::tenant::{
|
||||
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
|
||||
};
|
||||
use camino::Utf8PathBuf;
|
||||
use consumption_metrics::EventType;
|
||||
use pageserver_api::models::TenantState;
|
||||
@@ -349,12 +350,19 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
|
||||
// Same for the loop that fetches computed metrics.
|
||||
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
|
||||
// which turns out is really handy to understand the system.
|
||||
match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await {
|
||||
Ok(_) => {}
|
||||
Err(CalculateSyntheticSizeError::Cancelled) => {}
|
||||
Err(e) => {
|
||||
let tenant_shard_id = tenant.tenant_shard_id();
|
||||
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
|
||||
}
|
||||
let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
|
||||
return;
|
||||
};
|
||||
|
||||
// this error can be returned if timeline is shutting down, but it does not
|
||||
// mean the synthetic size worker should terminate.
|
||||
let shutting_down = matches!(
|
||||
e.downcast_ref::<PageReconstructError>(),
|
||||
Some(PageReconstructError::Cancelled)
|
||||
);
|
||||
|
||||
if !shutting_down {
|
||||
let tenant_shard_id = tenant.tenant_shard_id();
|
||||
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,6 @@ use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
@@ -182,7 +181,9 @@ impl From<PageReconstructError> for ApiError {
|
||||
PageReconstructError::MissingKey(e) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
|
||||
}
|
||||
PageReconstructError::Cancelled => ApiError::Cancelled,
|
||||
PageReconstructError::Cancelled => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||
}
|
||||
@@ -1136,10 +1137,7 @@ async fn tenant_size_handler(
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown,
|
||||
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut sizes = None;
|
||||
let accepts_html = headers
|
||||
@@ -1147,7 +1145,9 @@ async fn tenant_size_handler(
|
||||
.map(|v| v == "text/html")
|
||||
.unwrap_or_default();
|
||||
if !inputs_only.unwrap_or(false) {
|
||||
let storage_model = inputs.calculate_model();
|
||||
let storage_model = inputs
|
||||
.calculate_model()
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let size = storage_model.calculate();
|
||||
|
||||
// If request header expects html, return html
|
||||
@@ -1731,7 +1731,7 @@ async fn lsn_lease_handler(
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline
|
||||
.make_lsn_lease(lsn, LsnLease::DEFAULT_LENGTH, &ctx)
|
||||
.make_lsn_lease(lsn, &ctx)
|
||||
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
@@ -2431,25 +2431,6 @@ async fn list_aux_files(
|
||||
json_response(StatusCode::OK, files)
|
||||
}
|
||||
|
||||
async fn perf_info(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let result = timeline.perf_info().await;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
async fn ingest_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -2877,9 +2858,5 @@ pub fn make_router(
|
||||
|r| testing_api_handler("list_aux_files", r, list_aux_files),
|
||||
)
|
||||
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
|
||||
|r| testing_api_handler("perf_info", r, perf_info),
|
||||
)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -2108,7 +2108,6 @@ pub(crate) struct TimelineMetrics {
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
shutdown: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl TimelineMetrics {
|
||||
@@ -2228,7 +2227,6 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
shutdown: std::sync::atomic::AtomicBool::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2251,17 +2249,6 @@ impl TimelineMetrics {
|
||||
}
|
||||
|
||||
pub(crate) fn shutdown(&self) {
|
||||
let was_shutdown = self
|
||||
.shutdown
|
||||
.swap(true, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
if was_shutdown {
|
||||
// this happens on tenant deletion because tenant first shuts down timelines, then
|
||||
// invokes timeline deletion which first shuts down the timeline again.
|
||||
// TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
|
||||
return;
|
||||
}
|
||||
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
|
||||
@@ -9,7 +9,6 @@ use futures::stream::FuturesUnordered;
|
||||
use futures::Stream;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
@@ -936,7 +935,7 @@ impl PageServerHandler {
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
|
||||
.await?;
|
||||
let lease = timeline.make_lsn_lease(lsn, LsnLease::DEFAULT_LENGTH, ctx)?;
|
||||
let lease = timeline.make_lsn_lease(lsn, ctx)?;
|
||||
let valid_until = lease
|
||||
.valid_until
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
|
||||
@@ -36,7 +36,6 @@ use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
@@ -410,8 +409,6 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||
pausable_failpoint!("find-lsn-for-timestamp-pausable");
|
||||
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
// We use this method to figure out the branching LSN for the new branch, but the
|
||||
// GC cutoff could be before the branching point and we cannot create a new branch
|
||||
@@ -427,7 +424,6 @@ impl Timeline {
|
||||
|
||||
let mut found_smaller = false;
|
||||
let mut found_larger = false;
|
||||
|
||||
while low < high {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(PageReconstructError::Cancelled);
|
||||
@@ -919,14 +915,6 @@ impl Timeline {
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
{
|
||||
let guard = self.extra_test_dense_keyspace.load();
|
||||
for kr in &guard.ranges {
|
||||
result.add_range(kr.clone());
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
result.to_keyspace(),
|
||||
/* AUX sparse key space */
|
||||
|
||||
@@ -240,7 +240,6 @@ pub struct GcResult {
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
pub layers_needed_by_pitr: u64,
|
||||
pub layers_needed_by_branches: u64,
|
||||
pub layers_needed_by_leases: u64,
|
||||
pub layers_not_updated: u64,
|
||||
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
|
||||
@@ -270,7 +269,6 @@ impl AddAssign for GcResult {
|
||||
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
|
||||
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
|
||||
self.layers_needed_by_branches += other.layers_needed_by_branches;
|
||||
self.layers_needed_by_leases += other.layers_needed_by_leases;
|
||||
self.layers_not_updated += other.layers_not_updated;
|
||||
self.layers_removed += other.layers_removed;
|
||||
|
||||
|
||||
@@ -31,7 +31,6 @@ use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::fmt;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::sync::watch;
|
||||
@@ -66,9 +65,9 @@ use self::timeline::uninit::TimelineCreateGuard;
|
||||
use self::timeline::uninit::TimelineExclusionError;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::GcCutoffs;
|
||||
use self::timeline::TimelineResources;
|
||||
use self::timeline::WaitLsnError;
|
||||
use self::timeline::{GcCutoffs, GcInfo};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
@@ -510,24 +509,11 @@ pub(crate) enum GcError {
|
||||
#[error(transparent)]
|
||||
Remote(anyhow::Error),
|
||||
|
||||
// An error reading while calculating GC cutoffs
|
||||
#[error(transparent)]
|
||||
GcCutoffs(PageReconstructError),
|
||||
|
||||
// If GC was invoked for a particular timeline, this error means it didn't exist
|
||||
#[error("timeline not found")]
|
||||
TimelineNotFound,
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for GcError {
|
||||
fn from(value: PageReconstructError) -> Self {
|
||||
match value {
|
||||
PageReconstructError::Cancelled => Self::TimelineCancelled,
|
||||
other => Self::GcCutoffs(other),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Tenant {
|
||||
/// Yet another helper for timeline initialization.
|
||||
///
|
||||
@@ -1047,6 +1033,7 @@ impl Tenant {
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
timeline_get_throttle: self.timeline_get_throttle.clone(),
|
||||
},
|
||||
ctx,
|
||||
@@ -1072,6 +1059,7 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&index_part.metadata,
|
||||
remote_timeline_client,
|
||||
self.deletion_queue_client.clone(),
|
||||
)
|
||||
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
|
||||
.await
|
||||
@@ -2933,9 +2921,17 @@ impl Tenant {
|
||||
.checked_sub(horizon)
|
||||
.unwrap_or(Lsn(0));
|
||||
|
||||
let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
|
||||
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
|
||||
assert!(old.is_none());
|
||||
let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
|
||||
|
||||
match res {
|
||||
Ok(cutoffs) => {
|
||||
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
|
||||
assert!(old.is_none());
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !self.is_active() || self.cancel.is_cancelled() {
|
||||
@@ -3011,13 +3007,12 @@ impl Tenant {
|
||||
{
|
||||
let mut target = timeline.gc_info.write().unwrap();
|
||||
|
||||
let now = SystemTime::now();
|
||||
target.leases.retain(|_, lease| !lease.is_expired(&now));
|
||||
|
||||
match gc_cutoffs.remove(&timeline.timeline_id) {
|
||||
Some(cutoffs) => {
|
||||
target.retain_lsns = branchpoints;
|
||||
target.cutoffs = cutoffs;
|
||||
*target = GcInfo {
|
||||
retain_lsns: branchpoints,
|
||||
cutoffs,
|
||||
};
|
||||
}
|
||||
None => {
|
||||
// reasons for this being unavailable:
|
||||
@@ -3400,12 +3395,6 @@ impl Tenant {
|
||||
let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
|
||||
let unfinished_timeline = raw_timeline.raw_timeline()?;
|
||||
|
||||
// Flush the new layer files to disk, before we make the timeline as available to
|
||||
// the outside world.
|
||||
//
|
||||
// Flush loop needs to be spawned in order to be able to flush.
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
unfinished_timeline,
|
||||
&pgdata_path,
|
||||
@@ -3417,6 +3406,12 @@ impl Tenant {
|
||||
format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
// Flush the new layer files to disk, before we make the timeline as available to
|
||||
// the outside world.
|
||||
//
|
||||
// Flush loop needs to be spawned in order to be able to flush.
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
anyhow::bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
@@ -3448,6 +3443,7 @@ impl Tenant {
|
||||
);
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
timeline_get_throttle: self.timeline_get_throttle.clone(),
|
||||
}
|
||||
}
|
||||
@@ -3557,7 +3553,7 @@ impl Tenant {
|
||||
cause: LogicalSizeCalculationCause,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
|
||||
) -> anyhow::Result<size::ModelInputs> {
|
||||
let logical_sizes_at_once = self
|
||||
.conf
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
@@ -3572,8 +3568,8 @@ impl Tenant {
|
||||
// See more for on the issue #2748 condenced out of the initial PR review.
|
||||
let mut shared_cache = tokio::select! {
|
||||
locked = self.cached_logical_sizes.lock() => locked,
|
||||
_ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
|
||||
_ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
|
||||
_ = cancel.cancelled() => anyhow::bail!("cancelled"),
|
||||
_ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
|
||||
};
|
||||
|
||||
size::gather_inputs(
|
||||
@@ -3597,10 +3593,10 @@ impl Tenant {
|
||||
cause: LogicalSizeCalculationCause,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, size::CalculateSyntheticSizeError> {
|
||||
) -> anyhow::Result<u64> {
|
||||
let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;
|
||||
|
||||
let size = inputs.calculate();
|
||||
let size = inputs.calculate()?;
|
||||
|
||||
self.set_cached_synthetic_size(size);
|
||||
|
||||
@@ -4045,17 +4041,13 @@ mod tests {
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hex_literal::hex;
|
||||
use itertools::Itertools;
|
||||
use models::LsnLease;
|
||||
use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use rand::{thread_rng, Rng};
|
||||
use storage_layer::PersistentLayerKey;
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use utils::bin_ser::BeSer;
|
||||
@@ -4264,66 +4256,6 @@ mod tests {
|
||||
tline.freeze_and_flush().await.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_lsn_leases() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_leases")?.load().await;
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
let end_lsn = Lsn(0x100);
|
||||
let image_layers = (0x20..=0x90)
|
||||
.step_by(0x10)
|
||||
.map(|n| {
|
||||
(
|
||||
Lsn(n),
|
||||
vec![(key, test_img(&format!("data key at {:x}", n)))],
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(),
|
||||
image_layers,
|
||||
end_lsn,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let leased_lsns = [0x30, 0x50, 0x70];
|
||||
let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| {
|
||||
let _ = timeline.make_lsn_lease(Lsn(*n), LsnLease::DEFAULT_LENGTH, &ctx)?;
|
||||
Ok(())
|
||||
});
|
||||
|
||||
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
|
||||
|
||||
timeline.force_set_disk_consistent_lsn(end_lsn);
|
||||
|
||||
let res = tenant
|
||||
.gc_iteration(
|
||||
Some(TIMELINE_ID),
|
||||
0,
|
||||
Duration::ZERO,
|
||||
&CancellationToken::new(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Keeping everything <= Lsn(0x80) b/c leases:
|
||||
// 0/10: initdb layer
|
||||
// (0/20..=0/70).step_by(0x10): image layers added when creating the timeline.
|
||||
assert_eq!(res.layers_needed_by_leases, 7);
|
||||
// Keeping 0/90 b/c it is the latest layer.
|
||||
assert_eq!(res.layers_not_updated, 1);
|
||||
// Removed 0/80.
|
||||
assert_eq!(res.layers_removed, 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
@@ -5330,9 +5262,6 @@ mod tests {
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let mut test_key_end = test_key;
|
||||
test_key_end.field6 = NUM_KEYS as u32;
|
||||
tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end));
|
||||
|
||||
let mut keyspace = KeySpaceAccum::new();
|
||||
|
||||
@@ -6292,8 +6221,8 @@ mod tests {
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
|
||||
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
base_key.field1 = AUX_KEY_PREFIX;
|
||||
let mut test_key = base_key;
|
||||
let mut lsn = Lsn(0x10);
|
||||
|
||||
@@ -6398,7 +6327,6 @@ mod tests {
|
||||
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
|
||||
)
|
||||
.await?;
|
||||
tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next())));
|
||||
|
||||
let child = tenant
|
||||
.branch_timeline_test_with_layers(
|
||||
@@ -6656,8 +6584,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_image_creation() {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
|
||||
async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
@@ -6685,8 +6613,7 @@ mod tests {
|
||||
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
@@ -6701,24 +6628,23 @@ mod tests {
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
.await?;
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap()
|
||||
.await?
|
||||
.into_iter()
|
||||
.filter(|(k, _)| k.is_metadata_key())
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_empty_image_creation() {
|
||||
let harness =
|
||||
TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
|
||||
async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
@@ -6740,8 +6666,7 @@ mod tests {
|
||||
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
@@ -6756,249 +6681,16 @@ mod tests {
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
.await?;
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap()
|
||||
.await?
|
||||
.into_iter()
|
||||
.filter(|(k, _)| k.is_metadata_key())
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
// We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
|
||||
//
|
||||
// | D1 | | D3 |
|
||||
// -| |-- gc horizon -----------------
|
||||
// | | | D2 |
|
||||
// --------- img layer ------------------
|
||||
//
|
||||
// What we should expact from this compaction is:
|
||||
// | Part of D1 | | D3 |
|
||||
// --------- img layer with D1+D2 at GC horizon------------------
|
||||
|
||||
// img layer at 0x10
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), test_img(&format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
// TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose.
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 1@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::Image(test_img("value 2@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 3@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 5@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 6@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 8@0x40")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 9@0x40")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![delta1, delta2, delta3], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.pitr = Lsn(0x30);
|
||||
guard.cutoffs.horizon = Lsn(0x30);
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
|
||||
// Check if the image layer at the GC horizon contains exactly what we want
|
||||
let image_at_gc_horizon = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.filter(|(k, _)| k.is_metadata_key())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(image_at_gc_horizon.len(), 10);
|
||||
let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10];
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
image_at_gc_horizon[idx],
|
||||
(
|
||||
get_key(idx as u32),
|
||||
test_img(&format!("value {idx}@{:#x}", expected_lsn[idx]))
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Check if old layers are removed / new layers have the expected LSN
|
||||
let mut all_layers = tline.inspect_historic_layers().await.unwrap();
|
||||
all_layers.sort_by(|k1, k2| {
|
||||
(
|
||||
k1.is_delta,
|
||||
k1.key_range.start,
|
||||
k1.key_range.end,
|
||||
k1.lsn_range.start,
|
||||
k1.lsn_range.end,
|
||||
)
|
||||
.cmp(&(
|
||||
k2.is_delta,
|
||||
k2.key_range.start,
|
||||
k2.key_range.end,
|
||||
k2.lsn_range.start,
|
||||
k2.lsn_range.end,
|
||||
))
|
||||
});
|
||||
assert_eq!(
|
||||
all_layers,
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..get_key(10),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
// The delta layer that is cut in the middle
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..get_key(9),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x41),
|
||||
is_delta: true
|
||||
},
|
||||
// The delta layer we created and should not be picked for the compaction
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x40)..Lsn(0x41),
|
||||
is_delta: true
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_neon_test_record() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_neon_test_record")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
|
||||
),
|
||||
(get_key(2), Lsn(0x10), Value::Image("0x10".into())),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
|
||||
),
|
||||
(get_key(3), Lsn(0x10), Value::Image("0x10".into())),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_clear()),
|
||||
),
|
||||
(get_key(4), Lsn(0x10), Value::Image("0x10".into())),
|
||||
(
|
||||
get_key(4),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_init()),
|
||||
),
|
||||
];
|
||||
let image1 = vec![(get_key(1), "0x10".into())];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![delta1], // delta layers
|
||||
vec![(Lsn(0x10), image1)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
tline.get(get_key(1), Lsn(0x50), &ctx).await?,
|
||||
Bytes::from_static(b"0x10,0x20,0x30")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(get_key(2), Lsn(0x50), &ctx).await?,
|
||||
Bytes::from_static(b"0x10,0x20,0x30")
|
||||
);
|
||||
// assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
|
||||
// assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,23 +1,15 @@
|
||||
//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
|
||||
//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
|
||||
//! this struct and it's original serialization format is still needed because they were written a
|
||||
//! long time ago.
|
||||
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
|
||||
//! versioning.
|
||||
//! Later, the file gets used in [`remote_timeline_client`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//!
|
||||
//! To clean up this module we need to migrate all index_part.json files to a later version.
|
||||
//! While doing this, we need to be mindful about s3 based recovery as well, so it might take
|
||||
//! however long we keep the old versions to be able to delete the old code. After that, we can
|
||||
//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and
|
||||
//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards
|
||||
//! compatibility.
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart
|
||||
|
||||
use anyhow::ensure;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
|
||||
|
||||
@@ -25,37 +17,17 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
|
||||
const METADATA_FORMAT_VERSION: u16 = 4;
|
||||
|
||||
/// Previous supported format versions.
|
||||
///
|
||||
/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming
|
||||
/// that requires a scrubber run which is yet to be done.
|
||||
const METADATA_OLD_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic.
|
||||
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
|
||||
///
|
||||
/// This is the same assumption that PostgreSQL makes with the control file,
|
||||
///
|
||||
/// see PG_CONTROL_MAX_SAFE_SIZE
|
||||
const METADATA_MAX_SIZE: usize = 512;
|
||||
|
||||
/// Legacy metadata stored as a component of `index_part.json` per timeline.
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// Do not make new changes to this type or the module. In production, we have two different kinds
|
||||
/// of serializations of this type: bincode and json. Bincode version reflects what used to be
|
||||
/// stored on disk in earlier versions and does internal crc32 checksumming.
|
||||
///
|
||||
/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would
|
||||
/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern
|
||||
/// as-exists in `index_part.json` ([`self::modern_serde`]).
|
||||
///
|
||||
/// ```compile_fail
|
||||
/// #[derive(serde::Serialize)]
|
||||
/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata);
|
||||
/// ```
|
||||
///
|
||||
/// ```compile_fail
|
||||
/// #[derive(serde::Deserialize)]
|
||||
/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata);
|
||||
/// ```
|
||||
/// The fields correspond to the values we hold in memory, in Timeline.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TimelineMetadata {
|
||||
hdr: TimelineMetadataHeader,
|
||||
@@ -68,49 +40,6 @@ struct TimelineMetadataHeader {
|
||||
size: u16, // size of serialized metadata
|
||||
format_version: u16, // metadata format version (used for compatibility checks)
|
||||
}
|
||||
|
||||
impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
|
||||
type Error = Crc32CalculationFailed;
|
||||
|
||||
fn try_from(value: &TimelineMetadataBodyV2) -> Result<Self, Self::Error> {
|
||||
#[derive(Default)]
|
||||
struct Crc32Sink {
|
||||
crc: u32,
|
||||
count: usize,
|
||||
}
|
||||
|
||||
impl std::io::Write for Crc32Sink {
|
||||
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
||||
self.crc = crc32c::crc32c_append(self.crc, buf);
|
||||
self.count += buf.len();
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
|
||||
// across serialization versions
|
||||
let mut sink = Crc32Sink::default();
|
||||
<TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(value, &mut sink)
|
||||
.map_err(Crc32CalculationFailed)?;
|
||||
|
||||
let size = METADATA_HDR_SIZE + sink.count;
|
||||
|
||||
Ok(TimelineMetadataHeader {
|
||||
checksum: sink.crc,
|
||||
size: size as u16,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("re-serializing for crc32 failed")]
|
||||
struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
|
||||
|
||||
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -182,12 +111,6 @@ impl TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result<Self> {
|
||||
self.hdr = TimelineMetadataHeader::try_from(&self.body)?;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
|
||||
|
||||
@@ -338,8 +261,32 @@ impl TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TimelineMetadata {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let bytes = Vec::<u8>::deserialize(deserializer)?;
|
||||
Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for TimelineMetadata {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?;
|
||||
bytes.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) mod modern_serde {
|
||||
use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader};
|
||||
use crate::tenant::metadata::METADATA_FORMAT_VERSION;
|
||||
|
||||
use super::{
|
||||
TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
|
||||
@@ -375,15 +322,71 @@ pub(crate) mod modern_serde {
|
||||
|
||||
let de = serde::de::value::MapAccessDeserializer::new(map);
|
||||
let body = TimelineMetadataBodyV2::deserialize(de)?;
|
||||
let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?;
|
||||
|
||||
Ok(TimelineMetadata { hdr, body })
|
||||
// jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
|
||||
// across serialization versions
|
||||
let mut sink = Crc32Sink::default();
|
||||
<TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(&body, &mut sink)
|
||||
.map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?;
|
||||
|
||||
let size = METADATA_HDR_SIZE + sink.count;
|
||||
|
||||
Ok(TimelineMetadata {
|
||||
hdr: TimelineMetadataHeader {
|
||||
checksum: sink.crc,
|
||||
size: size as u16,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
},
|
||||
body,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_any(Visitor)
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct Crc32Sink {
|
||||
crc: u32,
|
||||
count: usize,
|
||||
}
|
||||
|
||||
impl std::io::Write for Crc32Sink {
|
||||
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
||||
self.crc = crc32c::crc32c_append(self.crc, buf);
|
||||
self.count += buf.len();
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
#[error("re-serializing for crc32 failed")]
|
||||
struct Crc32CalculationFailed<E>(#[source] E);
|
||||
|
||||
// this should be true for one release, after that we can change it to false
|
||||
// remember to check the IndexPart::metadata field TODO comment as well
|
||||
const LEGACY_BINCODED_BYTES: bool = true;
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
#[serde(transparent)]
|
||||
struct LegacyPaddedBytes<'a>(&'a TimelineMetadata);
|
||||
|
||||
struct JustTheBodyV2<'a>(&'a TimelineMetadata);
|
||||
|
||||
impl serde::Serialize for JustTheBodyV2<'_> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
// header is not needed, upon reading we've upgraded all v1 to v2
|
||||
self.0.body.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn serialize<S>(
|
||||
metadata: &TimelineMetadata,
|
||||
serializer: S,
|
||||
@@ -391,23 +394,25 @@ pub(crate) mod modern_serde {
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
// header is not needed, upon reading we've upgraded all v1 to v2
|
||||
metadata.body.serialize(serializer)
|
||||
// we cannot use TimelineMetadata::serialize for now because it'll do
|
||||
// TimelineMetadata::to_bytes
|
||||
if LEGACY_BINCODED_BYTES {
|
||||
LegacyPaddedBytes(metadata).serialize(serializer)
|
||||
} else {
|
||||
JustTheBodyV2(metadata).serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deserializes_bytes_as_well_as_equivalent_body_v2() {
|
||||
#[derive(serde::Deserialize, serde::Serialize)]
|
||||
struct Wrapper(
|
||||
#[serde(deserialize_with = "deserialize", serialize_with = "serialize")]
|
||||
TimelineMetadata,
|
||||
);
|
||||
struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata);
|
||||
|
||||
let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
|
||||
|
||||
let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
|
||||
|
||||
let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap();
|
||||
let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
serialized,
|
||||
@@ -548,6 +553,59 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_metadata_bincode_serde() {
|
||||
let original_metadata = TimelineMetadata::new(
|
||||
Lsn(0x200),
|
||||
Some(Lsn(0x100)),
|
||||
Some(TIMELINE_ID),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
);
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
.expect("Cannot create bytes array from metadata");
|
||||
|
||||
let metadata_bincode_be_bytes = original_metadata
|
||||
.ser()
|
||||
.expect("Cannot serialize the metadata");
|
||||
|
||||
// 8 bytes for the length of the vector
|
||||
assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
|
||||
|
||||
let expected_bincode_bytes = {
|
||||
let mut temp = vec![];
|
||||
let len_bytes = metadata_bytes.len().to_be_bytes();
|
||||
temp.extend_from_slice(&len_bytes);
|
||||
temp.extend_from_slice(&metadata_bytes);
|
||||
temp
|
||||
};
|
||||
assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
|
||||
|
||||
let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
|
||||
// Deserialized metadata has the metadata header, which is different from the serialized one.
|
||||
// Reference: TimelineMetaData::to_bytes()
|
||||
let expected_metadata = {
|
||||
let mut temp_metadata = original_metadata;
|
||||
let body_bytes = temp_metadata
|
||||
.body
|
||||
.ser()
|
||||
.expect("Cannot serialize the metadata body");
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
temp_metadata.hdr = hdr;
|
||||
temp_metadata
|
||||
};
|
||||
assert_eq!(deserialized_metadata, expected_metadata);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_metadata_bincode_serde_ensure_roundtrip() {
|
||||
let original_metadata = TimelineMetadata::new(
|
||||
@@ -561,6 +619,8 @@ mod tests {
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* bincode length encoding bytes */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
|
||||
/* TimelineMetadataHeader */
|
||||
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
/* TimelineMetadataBodyV2 */
|
||||
@@ -590,7 +650,7 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let metadata_ser_bytes = original_metadata.to_bytes().unwrap();
|
||||
let metadata_ser_bytes = original_metadata.ser().unwrap();
|
||||
assert_eq!(metadata_ser_bytes, expected_bytes);
|
||||
|
||||
let expected_metadata = {
|
||||
@@ -608,7 +668,7 @@ mod tests {
|
||||
temp_metadata.hdr = hdr;
|
||||
temp_metadata
|
||||
};
|
||||
let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap();
|
||||
let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
|
||||
assert_eq!(des_metadata, expected_metadata);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,17 +38,14 @@ pub struct IndexPart {
|
||||
/// that latest version stores.
|
||||
pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
|
||||
|
||||
/// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
|
||||
/// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
|
||||
/// reused.
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated for convenience when reading the serialized structure, but is
|
||||
// private because internally we would read from metadata instead.
|
||||
pub(super) disk_consistent_lsn: Lsn,
|
||||
|
||||
// TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding
|
||||
// the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes"
|
||||
// for backwards compatibility.
|
||||
// TODO: later make this "rename" to "alias", rename field as "legacy_metadata"
|
||||
#[serde(
|
||||
rename = "metadata_bytes",
|
||||
alias = "metadata",
|
||||
with = "crate::tenant::metadata::modern_serde"
|
||||
)]
|
||||
pub metadata: TimelineMetadata,
|
||||
@@ -79,11 +76,10 @@ impl IndexPart {
|
||||
/// - 4: timeline_layers is fully removed.
|
||||
/// - 5: lineage was added
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
/// - 7: metadata_bytes is no longer written, but still read
|
||||
const LATEST_VERSION: usize = 7;
|
||||
const LATEST_VERSION: usize = 6;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -99,7 +95,7 @@ impl IndexPart {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn version(&self) -> usize {
|
||||
pub fn get_version(&self) -> usize {
|
||||
self.version
|
||||
}
|
||||
|
||||
@@ -221,9 +217,9 @@ impl Lineage {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::str::FromStr;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn v1_indexpart_is_parsed() {
|
||||
@@ -342,7 +338,8 @@ mod tests {
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
@@ -518,7 +515,8 @@ mod tests {
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
@@ -531,60 +529,6 @@ mod tests {
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v7_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 7,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"deleted_at": "2023-07-31T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 7,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
lineage: Default::default(),
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
|
||||
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
|
||||
}
|
||||
|
||||
@@ -513,7 +513,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
// cover our access to local storage.
|
||||
let Ok(_guard) = self.secondary_state.gate.enter() else {
|
||||
// Shutting down
|
||||
return Err(UpdateError::Cancelled);
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
@@ -846,7 +846,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
for layer in timeline.layers {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return Err(UpdateError::Cancelled);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Existing on-disk layers: just update their access time.
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::sync::oneshot::error::RecvError;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -10,7 +11,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
|
||||
use super::{GcError, LogicalSizeCalculationCause, Tenant};
|
||||
use super::{LogicalSizeCalculationCause, Tenant};
|
||||
use crate::tenant::Timeline;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -42,44 +43,6 @@ pub struct SegmentMeta {
|
||||
pub kind: LsnKind,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum CalculateSyntheticSizeError {
|
||||
/// Something went wrong internally to the calculation of logical size at a particular branch point
|
||||
#[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")]
|
||||
LogicalSize {
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
error: CalculateLogicalSizeError,
|
||||
},
|
||||
|
||||
/// Something went wrong internally when calculating GC parameters at start of size calculation
|
||||
#[error(transparent)]
|
||||
GcInfo(GcError),
|
||||
|
||||
/// Totally unexpected errors, like panics joining a task
|
||||
#[error(transparent)]
|
||||
Fatal(anyhow::Error),
|
||||
|
||||
/// The LSN we are trying to calculate a size at no longer exists at the point we query it
|
||||
#[error("Could not find size at {lsn} in timeline {timeline_id}")]
|
||||
LsnNotFound { timeline_id: TimelineId, lsn: Lsn },
|
||||
|
||||
/// Tenant shut down while calculating size
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl From<GcError> for CalculateSyntheticSizeError {
|
||||
fn from(value: GcError) -> Self {
|
||||
match value {
|
||||
GcError::TenantCancelled | GcError::TimelineCancelled => {
|
||||
CalculateSyntheticSizeError::Cancelled
|
||||
}
|
||||
other => CalculateSyntheticSizeError::GcInfo(other),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentMeta {
|
||||
fn size_needed(&self) -> bool {
|
||||
match self.kind {
|
||||
@@ -153,9 +116,12 @@ pub(super) async fn gather_inputs(
|
||||
cause: LogicalSizeCalculationCause,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
|
||||
) -> anyhow::Result<ModelInputs> {
|
||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||
tenant.refresh_gc_info(cancel, ctx).await?;
|
||||
tenant
|
||||
.refresh_gc_info(cancel, ctx)
|
||||
.await
|
||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||
|
||||
// Collect information about all the timelines
|
||||
let mut timelines = tenant.list_timelines();
|
||||
@@ -361,12 +327,6 @@ pub(super) async fn gather_inputs(
|
||||
)
|
||||
.await?;
|
||||
|
||||
if tenant.cancel.is_cancelled() {
|
||||
// If we're shutting down, return an error rather than a sparse result that might include some
|
||||
// timelines from before we started shutting down
|
||||
return Err(CalculateSyntheticSizeError::Cancelled);
|
||||
}
|
||||
|
||||
Ok(ModelInputs {
|
||||
segments,
|
||||
timeline_inputs,
|
||||
@@ -385,7 +345,7 @@ async fn fill_logical_sizes(
|
||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||
cause: LogicalSizeCalculationCause,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CalculateSyntheticSizeError> {
|
||||
) -> anyhow::Result<()> {
|
||||
let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
|
||||
timelines
|
||||
.iter()
|
||||
@@ -427,7 +387,7 @@ async fn fill_logical_sizes(
|
||||
}
|
||||
|
||||
// Perform the size lookups
|
||||
let mut have_any_error = None;
|
||||
let mut have_any_error = false;
|
||||
while let Some(res) = joinset.join_next().await {
|
||||
// each of these come with Result<anyhow::Result<_>, JoinError>
|
||||
// because of spawn + spawn_blocking
|
||||
@@ -438,36 +398,21 @@ async fn fill_logical_sizes(
|
||||
Err(join_error) => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
|
||||
|
||||
have_any_error = Some(CalculateSyntheticSizeError::Fatal(
|
||||
anyhow::anyhow!(join_error)
|
||||
.context("task that calls spawn_ondemand_logical_size_calculation"),
|
||||
));
|
||||
have_any_error = true;
|
||||
}
|
||||
Ok(Err(recv_result_error)) => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
error!("failed to receive logical size query result: {recv_result_error:#}");
|
||||
have_any_error = Some(CalculateSyntheticSizeError::Fatal(
|
||||
anyhow::anyhow!(recv_result_error)
|
||||
.context("Receiving logical size query result"),
|
||||
));
|
||||
have_any_error = true;
|
||||
}
|
||||
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
|
||||
if matches!(error, CalculateLogicalSizeError::Cancelled) {
|
||||
// Skip this: it's okay if one timeline among many is shutting down while we
|
||||
// calculate inputs for the overall tenant.
|
||||
continue;
|
||||
} else {
|
||||
if !matches!(error, CalculateLogicalSizeError::Cancelled) {
|
||||
warn!(
|
||||
timeline_id=%timeline.timeline_id,
|
||||
"failed to calculate logical size at {lsn}: {error:#}"
|
||||
);
|
||||
have_any_error = Some(CalculateSyntheticSizeError::LogicalSize {
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn,
|
||||
error,
|
||||
});
|
||||
}
|
||||
have_any_error = true;
|
||||
}
|
||||
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
|
||||
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
|
||||
@@ -481,10 +426,10 @@ async fn fill_logical_sizes(
|
||||
// prune any keys not needed anymore; we record every used key and added key.
|
||||
logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));
|
||||
|
||||
if let Some(error) = have_any_error {
|
||||
if have_any_error {
|
||||
// we cannot complete this round, because we are missing data.
|
||||
// we have however cached all we were able to request calculation on.
|
||||
return Err(error);
|
||||
anyhow::bail!("failed to calculate some logical_sizes");
|
||||
}
|
||||
|
||||
// Insert the looked up sizes to the Segments
|
||||
@@ -499,29 +444,32 @@ async fn fill_logical_sizes(
|
||||
if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
|
||||
seg.segment.size = Some(*size);
|
||||
} else {
|
||||
return Err(CalculateSyntheticSizeError::LsnNotFound { timeline_id, lsn });
|
||||
bail!("could not find size at {} in timeline {}", lsn, timeline_id);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl ModelInputs {
|
||||
pub fn calculate_model(&self) -> tenant_size_model::StorageModel {
|
||||
pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
|
||||
// Convert SegmentMetas into plain Segments
|
||||
StorageModel {
|
||||
let storage = StorageModel {
|
||||
segments: self
|
||||
.segments
|
||||
.iter()
|
||||
.map(|seg| seg.segment.clone())
|
||||
.collect(),
|
||||
}
|
||||
};
|
||||
|
||||
Ok(storage)
|
||||
}
|
||||
|
||||
// calculate total project size
|
||||
pub fn calculate(&self) -> u64 {
|
||||
let storage = self.calculate_model();
|
||||
pub fn calculate(&self) -> anyhow::Result<u64> {
|
||||
let storage = self.calculate_model()?;
|
||||
let sizes = storage.calculate();
|
||||
sizes.total_size
|
||||
|
||||
Ok(sizes.total_size)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -708,7 +656,7 @@ fn verify_size_for_multiple_branches() {
|
||||
"#;
|
||||
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
|
||||
|
||||
assert_eq!(inputs.calculate(), 37_851_408);
|
||||
assert_eq!(inputs.calculate().unwrap(), 37_851_408);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -763,7 +711,7 @@ fn verify_size_for_one_branch() {
|
||||
|
||||
let model: ModelInputs = serde_json::from_str(doc).unwrap();
|
||||
|
||||
let res = model.calculate_model().calculate();
|
||||
let res = model.calculate_model().unwrap().calculate();
|
||||
|
||||
println!("calculated synthetic size: {}", res.total_size);
|
||||
println!("result: {:?}", serde_json::to_string(&res.segments));
|
||||
|
||||
@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
|
||||
#[derive(Debug)]
|
||||
struct LayerKeyspace {
|
||||
layer: ReadableLayer,
|
||||
target_keyspace: KeySpaceRandomAccum,
|
||||
target_keyspace: Vec<KeySpace>,
|
||||
}
|
||||
|
||||
impl LayerFringe {
|
||||
@@ -342,13 +342,17 @@ impl LayerFringe {
|
||||
_,
|
||||
LayerKeyspace {
|
||||
layer,
|
||||
mut target_keyspace,
|
||||
target_keyspace,
|
||||
},
|
||||
)) => Some((
|
||||
layer,
|
||||
target_keyspace.consume_keyspace(),
|
||||
read_desc.lsn_range,
|
||||
)),
|
||||
)) => {
|
||||
let mut keyspace = KeySpaceRandomAccum::new();
|
||||
for ks in target_keyspace {
|
||||
for part in ks.ranges {
|
||||
keyspace.add_range(part);
|
||||
}
|
||||
}
|
||||
Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
|
||||
}
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
}
|
||||
@@ -363,18 +367,16 @@ impl LayerFringe {
|
||||
let entry = self.layers.entry(layer_id.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().target_keyspace.add_keyspace(keyspace);
|
||||
entry.get_mut().target_keyspace.push(keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
lsn_range,
|
||||
layer_id: layer_id.clone(),
|
||||
});
|
||||
let mut accum = KeySpaceRandomAccum::new();
|
||||
accum.add_keyspace(keyspace);
|
||||
entry.insert(LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace: accum,
|
||||
target_keyspace: vec![keyspace],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -219,6 +219,7 @@ pub struct DeltaLayerInner {
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
file: VirtualFile,
|
||||
file_id: FileId,
|
||||
@@ -784,6 +785,7 @@ impl DeltaLayerInner {
|
||||
file_id,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
lsn_range: actual_summary.lsn_range,
|
||||
max_vectored_read_bytes,
|
||||
}))
|
||||
}
|
||||
@@ -909,7 +911,7 @@ impl DeltaLayerInner {
|
||||
|
||||
let reads = Self::plan_reads(
|
||||
&keyspace,
|
||||
lsn_range.clone(),
|
||||
lsn_range,
|
||||
data_end_offset,
|
||||
index_reader,
|
||||
planner,
|
||||
@@ -922,50 +924,11 @@ impl DeltaLayerInner {
|
||||
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start);
|
||||
reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
let mut result = Vec::new();
|
||||
let mut stream =
|
||||
Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
while let Some(item) = stream.next().await {
|
||||
let (key, lsn, pos) = item?;
|
||||
// TODO: dedup code with get_reconstruct_value
|
||||
// TODO: ctx handling and sharding
|
||||
cursor
|
||||
.read_blob_into_buf(pos.pos(), &mut buf, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to read blob from virtual file {}", self.file.path)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
self.file.path
|
||||
)
|
||||
})?;
|
||||
result.push((key, lsn, val));
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn plan_reads<Reader>(
|
||||
keyspace: &KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
@@ -485,34 +485,6 @@ impl ImageLayerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
let mut result = Vec::new();
|
||||
let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
while let Some(item) = stream.next().await {
|
||||
// TODO: dedup code with get_reconstruct_value
|
||||
let (raw_key, offset) = item?;
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
// TODO: ctx handling and sharding
|
||||
let blob = cursor
|
||||
.read_blob(offset, ctx)
|
||||
.await
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
result.push((key, self.lsn, Value::Image(value)));
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
|
||||
/// and the keys in this layer.
|
||||
///
|
||||
|
||||
@@ -52,7 +52,7 @@ pub struct InMemoryLayer {
|
||||
|
||||
/// Frozen layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is `None`.
|
||||
pub(crate) end_lsn: OnceLock<Lsn>,
|
||||
end_lsn: OnceLock<Lsn>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
|
||||
local_path_str: Arc<str>,
|
||||
|
||||
@@ -388,23 +388,6 @@ impl Layer {
|
||||
})
|
||||
}
|
||||
|
||||
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
|
||||
let layer = self
|
||||
.0
|
||||
.get_or_maybe_download(true, Some(ctx))
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
layer.load_key_values(&self.0, ctx).await
|
||||
}
|
||||
|
||||
/// Download the layer if evicted.
|
||||
///
|
||||
/// Will not error when the layer is already downloaded.
|
||||
@@ -1774,20 +1757,6 @@ impl DownloadedLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn load_key_values(
|
||||
&self,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.get(owner, ctx).await? {
|
||||
Delta(d) => d.load_key_values(ctx).await,
|
||||
Image(i) => i.load_key_values(ctx).await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
use LayerKind::*;
|
||||
match self.get(owner, ctx).await? {
|
||||
|
||||
@@ -815,7 +815,6 @@ async fn eviction_cancellation_on_drop() {
|
||||
/// A test case to remind you the cost of these structures. You can bump the size limit
|
||||
/// below if it is really necessary to add more fields to the structures.
|
||||
#[test]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn layer_size() {
|
||||
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
|
||||
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
|
||||
|
||||
@@ -14,7 +14,6 @@ use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use pageserver_api::models::LsnLease;
|
||||
use rand::Rng;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -347,10 +346,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
// cutoff specified as time.
|
||||
let ctx =
|
||||
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
|
||||
// Delay GC by default lease period at pageserver restart.
|
||||
tokio::time::sleep(LsnLease::DEFAULT_LENGTH).await;
|
||||
|
||||
let mut first = true;
|
||||
loop {
|
||||
tokio::select! {
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
pub(crate) mod analysis;
|
||||
mod compaction;
|
||||
pub mod delete;
|
||||
pub(crate) mod detach_ancestor;
|
||||
@@ -62,7 +61,6 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::metrics::GetKind;
|
||||
use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
@@ -76,6 +74,7 @@ use crate::{
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
};
|
||||
use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
|
||||
use crate::{
|
||||
disk_usage_eviction_task::finite_f32,
|
||||
tenant::storage_layer::{
|
||||
@@ -205,6 +204,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
pub timeline_get_throttle: Arc<
|
||||
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
|
||||
>,
|
||||
@@ -321,8 +321,6 @@ pub struct Timeline {
|
||||
/// Locked automatically by [`TimelineWriter`] and checkpointer.
|
||||
/// Must always be acquired before the layer map/individual layer lock
|
||||
/// to avoid deadlock.
|
||||
///
|
||||
/// The state is cleared upon freezing.
|
||||
write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
|
||||
|
||||
/// Used to avoid multiple `flush_loop` tasks running
|
||||
@@ -425,14 +423,6 @@ pub struct Timeline {
|
||||
|
||||
/// Indicate whether aux file v2 storage is enabled.
|
||||
pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
|
||||
|
||||
/// Some test cases directly place keys into the timeline without actually modifying the directory
|
||||
/// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
|
||||
/// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
|
||||
/// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and
|
||||
/// in the future, add `extra_test_sparse_keyspace` if necessary.
|
||||
#[cfg(test)]
|
||||
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -454,12 +444,6 @@ pub(crate) struct GcInfo {
|
||||
|
||||
/// The cutoff coordinates, which are combined by selecting the minimum.
|
||||
pub(crate) cutoffs: GcCutoffs,
|
||||
|
||||
// TODO(yuchen): If we decide to incorporate `retain_lsns` into the same structure,
|
||||
// this would look something like `BTreeMap<Lsn, Option<LsnLease>>`, where
|
||||
// `None` suggests an infinite lease (will be used by current retain_lsns).
|
||||
/// Leases granted to particular LSNs.
|
||||
pub(crate) leases: BTreeMap<Lsn, LsnLease>,
|
||||
}
|
||||
|
||||
impl GcInfo {
|
||||
@@ -1564,34 +1548,14 @@ impl Timeline {
|
||||
/// Obtains a temporary lease blocking garbage collection for the given LSN
|
||||
pub(crate) fn make_lsn_lease(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
length: Duration,
|
||||
_lsn: Lsn,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
|
||||
}
|
||||
|
||||
let mut lease = LsnLease {
|
||||
valid_until: SystemTime::now() + length,
|
||||
const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60);
|
||||
let lease = LsnLease {
|
||||
valid_until: SystemTime::now() + LEASE_LENGTH,
|
||||
};
|
||||
|
||||
{
|
||||
let mut gc_info = self.gc_info.write().unwrap();
|
||||
gc_info
|
||||
.leases
|
||||
.entry(lsn)
|
||||
.and_modify(|existing| {
|
||||
// Insert a lease only if it extends longer than the existing one.
|
||||
if lease.valid_until > existing.valid_until {
|
||||
*existing = lease.clone();
|
||||
} else {
|
||||
lease = existing.clone();
|
||||
}
|
||||
})
|
||||
.or_insert(lease.clone());
|
||||
}
|
||||
// TODO: dummy implementation
|
||||
Ok(lease)
|
||||
}
|
||||
|
||||
@@ -1604,15 +1568,7 @@ impl Timeline {
|
||||
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
|
||||
// polluting the span hierarchy.
|
||||
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
|
||||
let to_lsn = {
|
||||
// Freeze the current open in-memory layer. It will be written to disk on next
|
||||
// iteration.
|
||||
let mut g = self.write_lock.lock().await;
|
||||
|
||||
let to_lsn = self.get_last_record_lsn();
|
||||
self.freeze_inmem_layer_at(to_lsn, &mut g).await;
|
||||
to_lsn
|
||||
};
|
||||
let to_lsn = self.freeze_inmem_layer(false).await;
|
||||
self.flush_frozen_layers_and_wait(to_lsn).await
|
||||
}
|
||||
|
||||
@@ -1621,7 +1577,7 @@ impl Timeline {
|
||||
// an ephemeral layer open forever when idle. It also freezes layers if the global limit on
|
||||
// ephemeral layer bytes has been breached.
|
||||
pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
|
||||
let Ok(mut write_guard) = self.write_lock.try_lock() else {
|
||||
let Ok(_write_guard) = self.write_lock.try_lock() else {
|
||||
// If the write lock is held, there is an active wal receiver: rolling open layers
|
||||
// is their responsibility while they hold this lock.
|
||||
return;
|
||||
@@ -1698,35 +1654,24 @@ impl Timeline {
|
||||
self.last_freeze_at.load(),
|
||||
open_layer.get_opened_at(),
|
||||
) {
|
||||
let at_lsn = match open_layer.info() {
|
||||
match open_layer.info() {
|
||||
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
|
||||
// We may reach this point if the layer was already frozen by not yet flushed: flushing
|
||||
// happens asynchronously in the background.
|
||||
tracing::debug!(
|
||||
"Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
|
||||
);
|
||||
None
|
||||
}
|
||||
InMemoryLayerInfo::Open { .. } => {
|
||||
// Upgrade to a write lock and freeze the layer
|
||||
drop(layers_guard);
|
||||
let mut layers_guard = self.layers.write().await;
|
||||
let froze = layers_guard
|
||||
.try_freeze_in_memory_layer(
|
||||
current_lsn,
|
||||
&self.last_freeze_at,
|
||||
&mut write_guard,
|
||||
)
|
||||
layers_guard
|
||||
.try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
|
||||
.await;
|
||||
Some(current_lsn).filter(|_| froze)
|
||||
}
|
||||
};
|
||||
if let Some(lsn) = at_lsn {
|
||||
let res: Result<u64, _> = self.flush_frozen_layers(lsn);
|
||||
if let Err(e) = res {
|
||||
tracing::info!("failed to flush frozen layer after background freeze: {e:#}");
|
||||
}
|
||||
}
|
||||
self.flush_frozen_layers();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2090,11 +2035,11 @@ impl Timeline {
|
||||
true
|
||||
} else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
opened_at.elapsed()
|
||||
);
|
||||
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
opened_at.elapsed()
|
||||
);
|
||||
|
||||
true
|
||||
} else {
|
||||
@@ -2377,9 +2322,6 @@ impl Timeline {
|
||||
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
|
||||
|
||||
last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
|
||||
|
||||
#[cfg(test)]
|
||||
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
@@ -2438,7 +2380,7 @@ impl Timeline {
|
||||
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
|
||||
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
|
||||
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
|
||||
assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
|
||||
assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
|
||||
*flush_loop_state = FlushLoopState::Exited;
|
||||
Ok(())
|
||||
}
|
||||
@@ -3701,21 +3643,28 @@ impl Timeline {
|
||||
self.last_record_lsn.advance(new_lsn);
|
||||
}
|
||||
|
||||
async fn freeze_inmem_layer_at(
|
||||
&self,
|
||||
at: Lsn,
|
||||
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
|
||||
) {
|
||||
let frozen = {
|
||||
let mut guard = self.layers.write().await;
|
||||
guard
|
||||
.try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
|
||||
.await
|
||||
/// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
|
||||
/// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
|
||||
async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
|
||||
// Freeze the current open in-memory layer. It will be written to disk on next
|
||||
// iteration.
|
||||
|
||||
let _write_guard = if write_lock_held {
|
||||
None
|
||||
} else {
|
||||
Some(self.write_lock.lock().await)
|
||||
};
|
||||
if frozen {
|
||||
let now = Instant::now();
|
||||
*(self.last_freeze_ts.write().unwrap()) = now;
|
||||
}
|
||||
|
||||
let to_lsn = self.get_last_record_lsn();
|
||||
self.freeze_inmem_layer_at(to_lsn).await;
|
||||
to_lsn
|
||||
}
|
||||
|
||||
async fn freeze_inmem_layer_at(&self, at: Lsn) {
|
||||
let mut guard = self.layers.write().await;
|
||||
guard
|
||||
.try_freeze_in_memory_layer(at, &self.last_freeze_at)
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Layer flusher task's main loop.
|
||||
@@ -3809,14 +3758,18 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk.
|
||||
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`].
|
||||
/// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
|
||||
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
|
||||
///
|
||||
/// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the
|
||||
/// case, it means no data will be written between the top of the highest frozen layer and
|
||||
/// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data
|
||||
/// locally for that part of the WAL.
|
||||
fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result<u64, FlushLayerError> {
|
||||
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
|
||||
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
|
||||
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
|
||||
async fn flush_frozen_layers_and_wait(
|
||||
&self,
|
||||
last_record_lsn: Lsn,
|
||||
) -> Result<(), FlushLayerError> {
|
||||
let mut rx = self.layer_flush_done_tx.subscribe();
|
||||
|
||||
// Increment the flush cycle counter and wake up the flush task.
|
||||
// Remember the new value, so that when we listen for the flush
|
||||
// to finish, we know when the flush that we initiated has
|
||||
@@ -3831,18 +3784,13 @@ impl Timeline {
|
||||
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||
my_flush_request = *counter + 1;
|
||||
*counter = my_flush_request;
|
||||
*lsn = std::cmp::max(at_lsn, *lsn);
|
||||
*lsn = std::cmp::max(last_record_lsn, *lsn);
|
||||
});
|
||||
|
||||
Ok(my_flush_request)
|
||||
}
|
||||
|
||||
async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
|
||||
let mut rx = self.layer_flush_done_tx.subscribe();
|
||||
loop {
|
||||
{
|
||||
let (last_result_counter, last_result) = &*rx.borrow();
|
||||
if *last_result_counter >= request {
|
||||
if *last_result_counter >= my_flush_request {
|
||||
if let Err(err) = last_result {
|
||||
// We already logged the original error in
|
||||
// flush_loop. We cannot propagate it to the caller
|
||||
@@ -3869,9 +3817,12 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> {
|
||||
let token = self.flush_frozen_layers(at_lsn)?;
|
||||
self.wait_flush_completion(token).await
|
||||
fn flush_frozen_layers(&self) {
|
||||
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||
*counter += 1;
|
||||
|
||||
*lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
|
||||
});
|
||||
}
|
||||
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
@@ -4495,12 +4446,6 @@ impl Timeline {
|
||||
if mode == ImageLayerCreationMode::Initial {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
}
|
||||
if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
|
||||
// Skip compaction if there are not enough updates. Metadata compaction will do a scan and
|
||||
// might mess up with evictions.
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
} else if let ImageLayerCreationMode::Try = mode {
|
||||
// check_for_image_layers = false -> skip
|
||||
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
|
||||
@@ -4848,7 +4793,7 @@ impl Timeline {
|
||||
pitr: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<GcCutoffs, PageReconstructError> {
|
||||
) -> anyhow::Result<GcCutoffs> {
|
||||
let _timer = self
|
||||
.metrics
|
||||
.find_gc_cutoffs_histo
|
||||
@@ -4933,23 +4878,13 @@ impl Timeline {
|
||||
return Err(GcError::TimelineCancelled);
|
||||
}
|
||||
|
||||
let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
|
||||
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
|
||||
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
|
||||
let pitr_cutoff = gc_info.cutoffs.pitr;
|
||||
let retain_lsns = gc_info.retain_lsns.clone();
|
||||
|
||||
// Gets the minimum LSN that holds the valid lease.
|
||||
// Caveat: This value could be stale since we rely on refresh_gc_info to invalidate leases,
|
||||
// so there could be leases invalidated between the refresh and here.
|
||||
let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
|
||||
(
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
retain_lsns,
|
||||
max_lsn_with_valid_lease,
|
||||
)
|
||||
(horizon_cutoff, pitr_cutoff, retain_lsns)
|
||||
};
|
||||
|
||||
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||
@@ -4980,13 +4915,7 @@ impl Timeline {
|
||||
.set(Lsn::INVALID.0 as i64);
|
||||
|
||||
let res = self
|
||||
.gc_timeline(
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
retain_lsns,
|
||||
new_gc_cutoff,
|
||||
max_lsn_with_valid_lease,
|
||||
)
|
||||
.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
|
||||
.instrument(
|
||||
info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff),
|
||||
)
|
||||
@@ -5004,7 +4933,6 @@ impl Timeline {
|
||||
pitr_cutoff: Lsn,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
new_gc_cutoff: Lsn,
|
||||
max_lsn_with_valid_lease: Option<Lsn>,
|
||||
) -> Result<GcResult, GcError> {
|
||||
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
|
||||
|
||||
@@ -5052,8 +4980,7 @@ impl Timeline {
|
||||
// 1. it is older than cutoff LSN;
|
||||
// 2. it is older than PITR interval;
|
||||
// 3. it doesn't need to be retained for 'retain_lsns';
|
||||
// 4. it does not need to be kept for LSNs holding valid leases (logic is very similar to retain_lsns);
|
||||
// 5. newer on-disk image layers cover the layer's whole key range
|
||||
// 4. newer on-disk image layers cover the layer's whole key range
|
||||
//
|
||||
// TODO holding a write lock is too agressive and avoidable
|
||||
let mut guard = self.layers.write().await;
|
||||
@@ -5104,20 +5031,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Is there a valid lease that requires us to keep this layer?
|
||||
if let Some(lsn) = &max_lsn_with_valid_lease {
|
||||
if &l.get_lsn_range().start <= lsn {
|
||||
debug!(
|
||||
"keeping {} because there is a valid lease preventing GC at {}",
|
||||
l.layer_name(),
|
||||
lsn,
|
||||
);
|
||||
result.layers_needed_by_leases += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Is there a later on-disk layer for this relation?
|
||||
// 4. Is there a later on-disk layer for this relation?
|
||||
//
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
@@ -5139,13 +5053,13 @@ impl Timeline {
|
||||
if !layers
|
||||
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
|
||||
{
|
||||
info!("keeping {} because it is the latest layer", l.layer_name());
|
||||
debug!("keeping {} because it is the latest layer", l.layer_name());
|
||||
result.layers_not_updated += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
// We didn't find any reason to keep this file, so remove it.
|
||||
info!(
|
||||
debug!(
|
||||
"garbage collecting {} is_dropped: xx is_incremental: {}",
|
||||
l.layer_name(),
|
||||
l.is_incremental(),
|
||||
@@ -5495,11 +5409,6 @@ impl Timeline {
|
||||
self.last_record_lsn.advance(new_lsn);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(super) fn force_set_disk_consistent_lsn(&self, new_value: Lsn) {
|
||||
self.disk_consistent_lsn.store(new_value);
|
||||
}
|
||||
|
||||
/// Force create an image layer and place it into the layer map.
|
||||
///
|
||||
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
|
||||
@@ -5621,33 +5530,10 @@ impl Timeline {
|
||||
all_data.sort();
|
||||
Ok(all_data)
|
||||
}
|
||||
|
||||
/// Get all historic layer descriptors in the layer map
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn inspect_historic_layers(
|
||||
self: &Arc<Timeline>,
|
||||
) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
|
||||
let mut layers = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
for layer in guard.layer_map().iter_historic_layers() {
|
||||
layers.push(layer.key());
|
||||
}
|
||||
Ok(layers)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) {
|
||||
let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone();
|
||||
keyspace.merge(&ks);
|
||||
self.extra_test_dense_keyspace.store(Arc::new(keyspace));
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
|
||||
|
||||
/// Tracking writes ingestion does to a particular in-memory layer.
|
||||
///
|
||||
/// Cleared upon freezing a layer.
|
||||
struct TimelineWriterState {
|
||||
open_layer: Arc<InMemoryLayer>,
|
||||
current_size: u64,
|
||||
@@ -5688,6 +5574,12 @@ impl Deref for TimelineWriter<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineWriter<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.write_guard.take();
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum OpenLayerAction {
|
||||
Roll,
|
||||
@@ -5770,16 +5662,17 @@ impl<'a> TimelineWriter<'a> {
|
||||
}
|
||||
|
||||
async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
|
||||
assert!(self.write_guard.is_some());
|
||||
|
||||
self.tl.freeze_inmem_layer_at(freeze_at).await;
|
||||
|
||||
let now = Instant::now();
|
||||
*(self.last_freeze_ts.write().unwrap()) = now;
|
||||
|
||||
self.tl.flush_frozen_layers();
|
||||
|
||||
let current_size = self.write_guard.as_ref().unwrap().current_size;
|
||||
|
||||
// self.write_guard will be taken by the freezing
|
||||
self.tl
|
||||
.freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
|
||||
.await;
|
||||
|
||||
self.tl.flush_frozen_layers(freeze_at)?;
|
||||
|
||||
if current_size >= self.get_checkpoint_distance() * 2 {
|
||||
if current_size > self.get_checkpoint_distance() {
|
||||
warn!("Flushed oversized open layer with size {}", current_size)
|
||||
}
|
||||
|
||||
@@ -5792,27 +5685,9 @@ impl<'a> TimelineWriter<'a> {
|
||||
return OpenLayerAction::Open;
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
|
||||
// this check and assertion are not really needed because
|
||||
// LayerManager::try_freeze_in_memory_layer will always clear out the
|
||||
// TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there
|
||||
// is no TimelineWriterState.
|
||||
assert!(
|
||||
state.open_layer.end_lsn.get().is_some(),
|
||||
"our open_layer must be outdated"
|
||||
);
|
||||
|
||||
// this would be a memory leak waiting to happen because the in-memory layer always has
|
||||
// an index
|
||||
panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
|
||||
}
|
||||
|
||||
if state.prev_lsn == Some(lsn) {
|
||||
// Rolling mid LSN is not supported by [downstream code].
|
||||
// Rolling mid LSN is not supported by downstream code.
|
||||
// Hence, only roll at LSN boundaries.
|
||||
//
|
||||
// [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
|
||||
return OpenLayerAction::None;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,90 +0,0 @@
|
||||
use std::{collections::BTreeSet, ops::Range};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
pub(crate) struct RangeAnalysis {
|
||||
start: String,
|
||||
end: String,
|
||||
has_image: bool,
|
||||
num_of_deltas_above_image: usize,
|
||||
total_num_of_deltas: usize,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub(crate) async fn perf_info(&self) -> Vec<RangeAnalysis> {
|
||||
// First, collect all split points of the layers.
|
||||
let mut split_points = BTreeSet::new();
|
||||
let mut delta_ranges = Vec::new();
|
||||
let mut image_ranges = Vec::new();
|
||||
|
||||
let all_layer_files = {
|
||||
let guard = self.layers.read().await;
|
||||
guard.all_persistent_layers()
|
||||
};
|
||||
let lsn = self.get_last_record_lsn();
|
||||
|
||||
for key in all_layer_files {
|
||||
split_points.insert(key.key_range.start);
|
||||
split_points.insert(key.key_range.end);
|
||||
if key.is_delta {
|
||||
delta_ranges.push((key.key_range.clone(), key.lsn_range.clone()));
|
||||
} else {
|
||||
image_ranges.push((key.key_range.clone(), key.lsn_range.start));
|
||||
}
|
||||
}
|
||||
|
||||
// For each split range, compute the estimated read amplification.
|
||||
let split_points = split_points.into_iter().collect::<Vec<_>>();
|
||||
|
||||
let mut result = Vec::new();
|
||||
|
||||
for i in 0..(split_points.len() - 1) {
|
||||
let start = split_points[i];
|
||||
let end = split_points[i + 1];
|
||||
// Find the latest image layer that contains the information.
|
||||
let mut maybe_image_layers = image_ranges
|
||||
.iter()
|
||||
// We insert split points for all image layers, and therefore a `contains` check for the start point should be enough.
|
||||
.filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1));
|
||||
let image_layer = maybe_image_layers.last().cloned();
|
||||
let lsn_filter_start = image_layer
|
||||
.as_ref()
|
||||
.map(|(_, lsn)| *lsn)
|
||||
.unwrap_or(Lsn::INVALID);
|
||||
|
||||
fn overlaps_with(lsn_range_a: &Range<Lsn>, lsn_range_b: &Range<Lsn>) -> bool {
|
||||
!(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end)
|
||||
}
|
||||
|
||||
let maybe_delta_layers = delta_ranges
|
||||
.iter()
|
||||
.filter(|(key_range, lsn_range)| {
|
||||
key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range)
|
||||
})
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let pitr_delta_layers = delta_ranges
|
||||
.iter()
|
||||
.filter(|(key_range, _)| key_range.contains(&start))
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
result.push(RangeAnalysis {
|
||||
start: start.to_string(),
|
||||
end: end.to_string(),
|
||||
has_image: image_layer.is_some(),
|
||||
num_of_deltas_above_image: maybe_delta_layers.len(),
|
||||
total_num_of_deltas: pitr_delta_layers.len(),
|
||||
});
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
@@ -952,178 +952,6 @@ impl Timeline {
|
||||
adaptor.flush_updates().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
///
|
||||
/// The current implementation picks all delta + image layers that are below or intersecting with
|
||||
/// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
|
||||
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
|
||||
/// and create delta layers with all deltas >= gc horizon.
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
_cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
use crate::tenant::storage_layer::ValueReconstructState;
|
||||
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
|
||||
// The layer selection has the following properties:
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
|
||||
let (layer_selection, gc_cutoff) = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
|
||||
let mut selected_layers = Vec::new();
|
||||
// TODO: consider retain_lsns
|
||||
drop(gc_info);
|
||||
for desc in layers.iter_historic_layers() {
|
||||
if desc.get_lsn_range().start <= gc_cutoff {
|
||||
selected_layers.push(guard.get_from_desc(&desc));
|
||||
}
|
||||
}
|
||||
(selected_layers, gc_cutoff)
|
||||
};
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
let mut all_key_values = Vec::new();
|
||||
for layer in &layer_selection {
|
||||
all_key_values.extend(layer.load_key_values(ctx).await?);
|
||||
}
|
||||
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
|
||||
// image layers, make image appear later than delta.
|
||||
struct ValueWrapper<'a>(&'a crate::repository::Value);
|
||||
impl Ord for ValueWrapper<'_> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
use crate::repository::Value;
|
||||
use std::cmp::Ordering;
|
||||
match (self.0, other.0) {
|
||||
(Value::Image(_), Value::WalRecord(_)) => Ordering::Greater,
|
||||
(Value::WalRecord(_), Value::Image(_)) => Ordering::Less,
|
||||
_ => Ordering::Equal,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl PartialOrd for ValueWrapper<'_> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl PartialEq for ValueWrapper<'_> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == std::cmp::Ordering::Equal
|
||||
}
|
||||
}
|
||||
impl Eq for ValueWrapper<'_> {}
|
||||
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
|
||||
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
|
||||
});
|
||||
let max_lsn = all_key_values
|
||||
.iter()
|
||||
.map(|(_, lsn, _)| lsn)
|
||||
.max()
|
||||
.copied()
|
||||
.unwrap()
|
||||
+ 1;
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
|
||||
|
||||
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
|
||||
async fn flush_accumulated_states(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
|
||||
horizon: Lsn,
|
||||
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
|
||||
let mut base_image = None;
|
||||
let mut keys_above_horizon = Vec::new();
|
||||
let mut delta_above_base_image = Vec::new();
|
||||
// We have a list of deltas/images. We want to create image layers while collect garbages.
|
||||
for (key, lsn, val) in accumulated_values.iter().rev() {
|
||||
if *lsn > horizon {
|
||||
keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both
|
||||
} else if *lsn <= horizon {
|
||||
match val {
|
||||
crate::repository::Value::Image(image) => {
|
||||
if lsn <= &horizon {
|
||||
base_image = Some((*lsn, image.clone()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
crate::repository::Value::WalRecord(wal) => {
|
||||
delta_above_base_image.push((*lsn, wal.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
delta_above_base_image.reverse();
|
||||
keys_above_horizon.reverse();
|
||||
let state = ValueReconstructState {
|
||||
img: base_image,
|
||||
records: delta_above_base_image,
|
||||
};
|
||||
let img = tline.reconstruct_value(key, horizon, state).await?;
|
||||
Ok((keys_above_horizon, img))
|
||||
}
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
all_key_values.first().unwrap().0,
|
||||
gc_cutoff..max_lsn, // TODO: off by one?
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for item @ (key, _, _) in &all_key_values {
|
||||
if &last_key == key {
|
||||
accumulated_values.push(item);
|
||||
} else {
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
|
||||
.await?;
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
accumulated_values.clear();
|
||||
accumulated_values.push(item);
|
||||
last_key = *key;
|
||||
}
|
||||
}
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
accumulated_values.clear();
|
||||
// TODO: split layers
|
||||
let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?;
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.finish_gc_compaction(
|
||||
&layer_selection,
|
||||
&[delta_layer.clone(), image_layer.clone()],
|
||||
&self.metrics,
|
||||
)
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct TimelineAdaptor {
|
||||
|
||||
@@ -11,6 +11,7 @@ use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
deletion_queue::DeletionQueueClient,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
metadata::TimelineMetadata,
|
||||
@@ -262,6 +263,7 @@ impl DeleteTimelineFlow {
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: &TimelineMetadata,
|
||||
remote_client: RemoteTimelineClient,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
|
||||
// RemoteTimelineClient is the only functioning part.
|
||||
@@ -272,6 +274,7 @@ impl DeleteTimelineFlow {
|
||||
None, // Ancestor is not needed for deletion.
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client,
|
||||
timeline_get_throttle: tenant.timeline_get_throttle.clone(),
|
||||
},
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use tracing::trace;
|
||||
@@ -21,8 +20,6 @@ use crate::{
|
||||
},
|
||||
};
|
||||
|
||||
use super::TimelineWriterState;
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct LayerManager {
|
||||
@@ -122,20 +119,18 @@ impl LayerManager {
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
/// Tries to freeze an open layer and also manages clearing the TimelineWriterState.
|
||||
///
|
||||
/// Returns true if anything was frozen.
|
||||
pub(super) async fn try_freeze_in_memory_layer(
|
||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||
pub(crate) async fn try_freeze_in_memory_layer(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_freeze_at: &AtomicLsn,
|
||||
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
|
||||
) -> bool {
|
||||
) {
|
||||
let Lsn(last_record_lsn) = lsn;
|
||||
let end_lsn = Lsn(last_record_lsn + 1);
|
||||
|
||||
let froze = if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
let open_layer_rc = Arc::clone(open_layer);
|
||||
// Does this layer need freezing?
|
||||
open_layer.freeze(end_lsn).await;
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
@@ -143,25 +138,11 @@ impl LayerManager {
|
||||
self.layer_map.frozen_layers.push_back(open_layer_rc);
|
||||
self.layer_map.open_layer = None;
|
||||
self.layer_map.next_open_layer_at = Some(end_lsn);
|
||||
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
}
|
||||
|
||||
// Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
|
||||
// accounts for regions in the LSN range where we might have ingested no data due to sharding.
|
||||
last_freeze_at.store(end_lsn);
|
||||
|
||||
// the writer state must no longer have a reference to the frozen layer
|
||||
let taken = write_lock.take();
|
||||
assert_eq!(
|
||||
froze,
|
||||
taken.is_some(),
|
||||
"should only had frozen a layer when TimelineWriterState existed"
|
||||
);
|
||||
|
||||
froze
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
@@ -226,18 +207,6 @@ impl LayerManager {
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Called when a GC-compaction is completed.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn finish_gc_compaction(
|
||||
&mut self,
|
||||
compact_from: &[Layer],
|
||||
compact_to: &[ResidentLayer],
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
// We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
|
||||
self.finish_compact_l0(compact_from, compact_to, metrics)
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
pub(crate) fn rewrite_layers(
|
||||
&mut self,
|
||||
@@ -339,10 +308,6 @@ impl LayerManager {
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
|
||||
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
|
||||
self.layer_fmgr.0.keys().cloned().collect_vec()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
|
||||
@@ -49,19 +49,6 @@ pub enum NeonWalRecord {
|
||||
file_path: String,
|
||||
content: Option<Bytes>,
|
||||
},
|
||||
|
||||
/// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
|
||||
#[cfg(test)]
|
||||
Test {
|
||||
/// Append a string to the image.
|
||||
append: String,
|
||||
/// Clear the image before appending.
|
||||
clear: bool,
|
||||
/// Treat this record as an init record. `clear` should be set to true if this field is set
|
||||
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
|
||||
/// its references in `timeline.rs`.
|
||||
will_init: bool,
|
||||
},
|
||||
}
|
||||
|
||||
impl NeonWalRecord {
|
||||
@@ -71,39 +58,11 @@ impl NeonWalRecord {
|
||||
// If you change this function, you'll also need to change ValueBytes::will_init
|
||||
match self {
|
||||
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
|
||||
#[cfg(test)]
|
||||
NeonWalRecord::Test { will_init, .. } => *will_init,
|
||||
|
||||
// None of the special neon record types currently initialize the page
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn wal_append(s: impl AsRef<str>) -> Self {
|
||||
Self::Test {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: false,
|
||||
will_init: false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn wal_clear() -> Self {
|
||||
Self::Test {
|
||||
append: "".to_string(),
|
||||
clear: true,
|
||||
will_init: false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn wal_init() -> Self {
|
||||
Self::Test {
|
||||
append: "".to_string(),
|
||||
clear: true,
|
||||
will_init: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// DecodedBkpBlock represents per-page data contained in a WAL record.
|
||||
|
||||
@@ -244,20 +244,6 @@ pub(crate) fn apply_in_neon(
|
||||
let mut writer = page.writer();
|
||||
dir.ser_into(&mut writer)?;
|
||||
}
|
||||
#[cfg(test)]
|
||||
NeonWalRecord::Test {
|
||||
append,
|
||||
clear,
|
||||
will_init,
|
||||
} => {
|
||||
if *will_init {
|
||||
assert!(*clear, "init record must be clear to ensure correctness");
|
||||
}
|
||||
if *clear {
|
||||
page.clear();
|
||||
}
|
||||
page.put_slice(append.as_bytes());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
use anyhow::Context;
|
||||
@@ -27,7 +26,6 @@ use utils::{lsn::Lsn, poison::Poison};
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
#[cfg(feature = "testing")]
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
@@ -145,7 +143,6 @@ impl WalRedoProcess {
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
#[cfg(feature = "testing")]
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: tokio::sync::Mutex::new(Poison::new(
|
||||
@@ -181,7 +178,7 @@ impl WalRedoProcess {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation safe.
|
||||
#[instrument(skip_all, fields(pid=%self.id()))]
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
@@ -190,8 +187,6 @@ impl WalRedoProcess {
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
|
||||
@@ -1,223 +0,0 @@
|
||||
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
|
||||
Author: Alexey Masterov <alexeymasterov@neon.tech>
|
||||
Date: Fri May 31 06:34:26 2024 +0000
|
||||
|
||||
These alternative expected files were added to consider the neon features
|
||||
|
||||
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
|
||||
new file mode 100644
|
||||
index 0000000..2539cfd
|
||||
--- /dev/null
|
||||
+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
|
||||
@@ -0,0 +1,101 @@
|
||||
+BEGIN;
|
||||
+CREATE EXTENSION anon CASCADE;
|
||||
+NOTICE: installing required extension "pgcrypto"
|
||||
+SELECT anon.init();
|
||||
+ init
|
||||
+------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+CREATE ROLE mallory_the_masked_user;
|
||||
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
|
||||
+CREATE TABLE t1(i INT);
|
||||
+ALTER TABLE t1 ADD COLUMN t TEXT;
|
||||
+SECURITY LABEL FOR anon ON COLUMN t1.t
|
||||
+IS 'MASKED WITH VALUE NULL';
|
||||
+INSERT INTO t1 VALUES (1,'test');
|
||||
+--
|
||||
+-- We're checking the owner's permissions
|
||||
+--
|
||||
+-- see
|
||||
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
|
||||
+--
|
||||
+SET ROLE mallory_the_masked_user;
|
||||
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
|
||||
+ ?column?
|
||||
+----------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+-- SHOULD FAIL
|
||||
+DO $$
|
||||
+BEGIN
|
||||
+ PERFORM anon.init();
|
||||
+ EXCEPTION WHEN insufficient_privilege
|
||||
+ THEN RAISE NOTICE 'insufficient_privilege';
|
||||
+END$$;
|
||||
+NOTICE: insufficient_privilege
|
||||
+-- SHOULD FAIL
|
||||
+DO $$
|
||||
+BEGIN
|
||||
+ PERFORM anon.anonymize_table('t1');
|
||||
+ EXCEPTION WHEN insufficient_privilege
|
||||
+ THEN RAISE NOTICE 'insufficient_privilege';
|
||||
+END$$;
|
||||
+NOTICE: insufficient_privilege
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_start_engine;
|
||||
+SELECT anon.start_dynamic_masking();
|
||||
+ERROR: Only supersusers can start the dynamic masking engine.
|
||||
+CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
|
||||
+ROLLBACK TO fail_start_engine;
|
||||
+RESET ROLE;
|
||||
+SELECT anon.start_dynamic_masking();
|
||||
+ start_dynamic_masking
|
||||
+-----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SET ROLE mallory_the_masked_user;
|
||||
+SELECT * FROM mask.t1;
|
||||
+ i | t
|
||||
+---+---
|
||||
+ 1 |
|
||||
+(1 row)
|
||||
+
|
||||
+-- SHOULD FAIL
|
||||
+DO $$
|
||||
+BEGIN
|
||||
+ SELECT * FROM public.t1;
|
||||
+ EXCEPTION WHEN insufficient_privilege
|
||||
+ THEN RAISE NOTICE 'insufficient_privilege';
|
||||
+END$$;
|
||||
+NOTICE: insufficient_privilege
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_stop_engine;
|
||||
+SELECT anon.stop_dynamic_masking();
|
||||
+ERROR: Only supersusers can stop the dynamic masking engine.
|
||||
+CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
|
||||
+ROLLBACK TO fail_stop_engine;
|
||||
+RESET ROLE;
|
||||
+SELECT anon.stop_dynamic_masking();
|
||||
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
|
||||
+ stop_dynamic_masking
|
||||
+----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SET ROLE mallory_the_masked_user;
|
||||
+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
|
||||
+ ?column?
|
||||
+----------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_seclabel_on_role;
|
||||
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
|
||||
+ERROR: permission denied
|
||||
+DETAIL: The current user must have the CREATEROLE attribute.
|
||||
+ROLLBACK TO fail_seclabel_on_role;
|
||||
+ROLLBACK;
|
||||
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
|
||||
new file mode 100644
|
||||
index 0000000..8b090fe
|
||||
--- /dev/null
|
||||
+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
|
||||
@@ -0,0 +1,104 @@
|
||||
+BEGIN;
|
||||
+CREATE EXTENSION anon CASCADE;
|
||||
+NOTICE: installing required extension "pgcrypto"
|
||||
+SELECT anon.init();
|
||||
+ init
|
||||
+------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+CREATE ROLE oscar_the_owner;
|
||||
+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
|
||||
+CREATE ROLE mallory_the_masked_user;
|
||||
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
|
||||
+--
|
||||
+-- We're checking the owner's permissions
|
||||
+--
|
||||
+-- see
|
||||
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
|
||||
+--
|
||||
+SET ROLE oscar_the_owner;
|
||||
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
|
||||
+ ?column?
|
||||
+----------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+-- SHOULD FAIL
|
||||
+DO $$
|
||||
+BEGIN
|
||||
+ PERFORM anon.init();
|
||||
+ EXCEPTION WHEN insufficient_privilege
|
||||
+ THEN RAISE NOTICE 'insufficient_privilege';
|
||||
+END$$;
|
||||
+NOTICE: insufficient_privilege
|
||||
+CREATE TABLE t1(i INT);
|
||||
+ALTER TABLE t1 ADD COLUMN t TEXT;
|
||||
+SECURITY LABEL FOR anon ON COLUMN t1.t
|
||||
+IS 'MASKED WITH VALUE NULL';
|
||||
+INSERT INTO t1 VALUES (1,'test');
|
||||
+SELECT anon.anonymize_table('t1');
|
||||
+ anonymize_table
|
||||
+-----------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SELECT * FROM t1;
|
||||
+ i | t
|
||||
+---+---
|
||||
+ 1 |
|
||||
+(1 row)
|
||||
+
|
||||
+UPDATE t1 SET t='test' WHERE i=1;
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_start_engine;
|
||||
+SELECT anon.start_dynamic_masking();
|
||||
+ start_dynamic_masking
|
||||
+-----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+ROLLBACK TO fail_start_engine;
|
||||
+RESET ROLE;
|
||||
+SELECT anon.start_dynamic_masking();
|
||||
+ start_dynamic_masking
|
||||
+-----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SET ROLE oscar_the_owner;
|
||||
+SELECT * FROM t1;
|
||||
+ i | t
|
||||
+---+------
|
||||
+ 1 | test
|
||||
+(1 row)
|
||||
+
|
||||
+--SELECT * FROM mask.t1;
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_stop_engine;
|
||||
+SELECT anon.stop_dynamic_masking();
|
||||
+ERROR: permission denied for schema mask
|
||||
+CONTEXT: SQL statement "DROP VIEW mask.t1;"
|
||||
+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
|
||||
+SQL statement "SELECT anon.mask_drop_view(oid)
|
||||
+ FROM pg_catalog.pg_class
|
||||
+ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
|
||||
+ AND relkind IN ('r','p','f')"
|
||||
+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
|
||||
+ROLLBACK TO fail_stop_engine;
|
||||
+RESET ROLE;
|
||||
+SELECT anon.stop_dynamic_masking();
|
||||
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
|
||||
+ stop_dynamic_masking
|
||||
+----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SET ROLE oscar_the_owner;
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_seclabel_on_role;
|
||||
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
|
||||
+ERROR: permission denied
|
||||
+DETAIL: The current user must have the CREATEROLE attribute.
|
||||
+ROLLBACK TO fail_seclabel_on_role;
|
||||
+ROLLBACK;
|
||||
@@ -1,19 +0,0 @@
|
||||
commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master)
|
||||
Author: Alexey Masterov <alexeymasterov@neon.tech>
|
||||
Date: Fri Jun 7 19:23:42 2024 +0000
|
||||
|
||||
Disable REGRESS_OPTIONS causing initdb
|
||||
|
||||
diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile
|
||||
index 053314c..fbd5fb5 100644
|
||||
--- a/ext-src/pg_cron-src/Makefile
|
||||
+++ b/ext-src/pg_cron-src/Makefile
|
||||
@@ -5,7 +5,7 @@ EXTENSION = pg_cron
|
||||
DATA_built = $(EXTENSION)--1.0.sql
|
||||
DATA = $(wildcard $(EXTENSION)--*--*.sql)
|
||||
|
||||
-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
|
||||
+#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
|
||||
REGRESS = pg_cron-test
|
||||
|
||||
# compilation configuration
|
||||
@@ -1,39 +0,0 @@
|
||||
commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
|
||||
Author: Alexey Masterov <alexeymasterov@neon.tech>
|
||||
Date: Thu Jun 6 08:02:42 2024 +0000
|
||||
|
||||
Patch expected files to consider Neon's log messages
|
||||
|
||||
diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
index da723b8..f8d0102 100644
|
||||
--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
@@ -9,13 +9,16 @@ SET search_path TO public;
|
||||
----
|
||||
-- No.A-1-1-3
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
-- No.A-1-2-3
|
||||
DROP EXTENSION pg_hint_plan;
|
||||
-- No.A-1-1-4
|
||||
CREATE SCHEMA other_schema;
|
||||
CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan"
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
DROP SCHEMA other_schema;
|
||||
----
|
||||
---- No. A-5-1 comment pattern
|
||||
diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
index d372459..6282afe 100644
|
||||
--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
|
||||
SET client_min_messages TO LOG;
|
||||
SET pg_hint_plan.enable_hint TO on;
|
||||
CREATE EXTENSION file_fdw;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
|
||||
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
|
||||
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
|
||||
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
|
||||
@@ -1,8 +1,19 @@
|
||||
From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
|
||||
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
|
||||
Date: Fri, 2 Feb 2024 22:26:45 +0200
|
||||
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
|
||||
|
||||
Now that the WAL-logging happens as a separate step at the end of the
|
||||
build, we need a few neon-specific hints to make it work.
|
||||
---
|
||||
src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 36 insertions(+)
|
||||
|
||||
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
|
||||
index dcfb2bd..d5189ee 100644
|
||||
index 680789b..ec54dea 100644
|
||||
--- a/src/hnswbuild.c
|
||||
+++ b/src/hnswbuild.c
|
||||
@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
|
||||
@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
|
||||
|
||||
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
|
||||
|
||||
@@ -20,7 +31,7 @@ index dcfb2bd..d5189ee 100644
|
||||
/* Close relations within worker */
|
||||
index_close(indexRel, indexLockmode);
|
||||
table_close(heapRel, heapLockmode);
|
||||
@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
SeedRandom(42);
|
||||
#endif
|
||||
|
||||
@@ -32,13 +43,14 @@ index dcfb2bd..d5189ee 100644
|
||||
|
||||
BuildGraph(buildstate, forkNum);
|
||||
|
||||
- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
+ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
|
||||
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
|
||||
if (RelationNeedsWAL(index))
|
||||
+ {
|
||||
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ {
|
||||
+#if PG_VERSION_NUM >= 160000
|
||||
@@ -48,7 +60,7 @@ index dcfb2bd..d5189ee 100644
|
||||
+#endif
|
||||
+
|
||||
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
|
||||
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
|
||||
+ }
|
||||
+#endif
|
||||
@@ -57,6 +69,10 @@ index dcfb2bd..d5189ee 100644
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_end_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
|
||||
+
|
||||
FreeBuildState(buildstate);
|
||||
}
|
||||
|
||||
--
|
||||
2.39.2
|
||||
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
;; see also src/tools/editors/emacs.samples for more complete settings
|
||||
|
||||
((c-mode . ((c-basic-offset . 4)
|
||||
(c-file-style . "bsd")
|
||||
(fill-column . 78)
|
||||
(indent-tabs-mode . t)
|
||||
(tab-width . 4)))
|
||||
(nxml-mode . ((fill-column . 78)
|
||||
(indent-tabs-mode . nil)))
|
||||
(perl-mode . ((perl-indent-level . 4)
|
||||
(perl-continued-statement-offset . 2)
|
||||
(perl-continued-brace-offset . -2)
|
||||
(perl-brace-offset . 0)
|
||||
(perl-brace-imaginary-offset . 0)
|
||||
(perl-label-offset . -2)
|
||||
(indent-tabs-mode . t)
|
||||
(tab-width . 4)))
|
||||
(sgml-mode . ((fill-column . 78)
|
||||
(indent-tabs-mode . nil))))
|
||||
@@ -1,14 +0,0 @@
|
||||
root = true
|
||||
|
||||
[*.{c,h,l,y,pl,pm}]
|
||||
indent_style = tab
|
||||
indent_size = tab
|
||||
tab_width = 4
|
||||
|
||||
[*.{sgml,xml}]
|
||||
indent_style = space
|
||||
indent_size = 1
|
||||
|
||||
[*.xsl]
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
@@ -19,7 +19,6 @@
|
||||
#include "catalog/pg_type.h"
|
||||
#include "postmaster/bgworker.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "replication/logical.h"
|
||||
#include "replication/slot.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "storage/procsignal.h"
|
||||
@@ -281,7 +280,6 @@ _PG_init(void)
|
||||
pg_init_libpagestore();
|
||||
pg_init_walproposer();
|
||||
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
|
||||
InitLogicalReplicationMonitor();
|
||||
|
||||
|
||||
@@ -605,7 +605,7 @@ prefetch_read(PrefetchRequest *slot)
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_shard_log(slot->shard_no, LOG,
|
||||
neon_shard_log(slot->shard_no, WARNING,
|
||||
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
|
||||
(long)slot->my_ring_index,
|
||||
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
|
||||
@@ -3112,12 +3112,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
request_lsn = UINT64_MAX;
|
||||
|
||||
/*
|
||||
* GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU
|
||||
* GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
|
||||
* segment has not changed since the basebackup, because in order to
|
||||
* modify it, we would have had to download it already. And once
|
||||
* downloaded, we never evict SLRU segments from local disk.
|
||||
*/
|
||||
not_modified_since = nm_adjust_lsn(GetRedoStartLsn());
|
||||
not_modified_since = GetRedoStartLsn();
|
||||
|
||||
SlruKind kind;
|
||||
|
||||
|
||||
@@ -24,12 +24,8 @@
|
||||
#include "walproposer.h"
|
||||
|
||||
static NeonWALReader *wal_reader = NULL;
|
||||
|
||||
struct WalSnd;
|
||||
extern struct WalSnd *MyWalSnd;
|
||||
extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
|
||||
extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
|
||||
extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
|
||||
|
||||
static XLogRecPtr
|
||||
NeonWALReadWaitForWAL(XLogRecPtr loc)
|
||||
@@ -40,28 +36,7 @@ NeonWALReadWaitForWAL(XLogRecPtr loc)
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
|
||||
// Walsender sends keepalives and stuff, so better use its normal wait
|
||||
if (MyWalSnd != NULL)
|
||||
return WalSndWaitForWal(loc);
|
||||
|
||||
for (;;)
|
||||
{
|
||||
XLogRecPtr flush_ptr;
|
||||
if (!RecoveryInProgress())
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
flush_ptr = GetFlushRecPtr(NULL);
|
||||
#else
|
||||
flush_ptr = GetFlushRecPtr();
|
||||
#endif
|
||||
else
|
||||
flush_ptr = GetXLogReplayRecPtr(NULL);
|
||||
|
||||
if (loc <= flush_ptr)
|
||||
return flush_ptr;
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
pg_usleep(1000);
|
||||
}
|
||||
return WalSndWaitForWal(loc);
|
||||
}
|
||||
|
||||
static int
|
||||
|
||||
@@ -356,7 +356,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let cancel_map = CancelMap::default();
|
||||
|
||||
let redis_publisher = match ®ional_redis_client {
|
||||
let redis_publisher = match &redis_notifications_client {
|
||||
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
redis_publisher.clone(),
|
||||
args.region.clone(),
|
||||
|
||||
@@ -1,183 +1,16 @@
|
||||
use measured::FixedCardinalityLabel;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt::{self, Display};
|
||||
use std::fmt;
|
||||
|
||||
use crate::auth::IpPattern;
|
||||
|
||||
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
|
||||
use crate::proxy::retry::ShouldRetry;
|
||||
|
||||
/// Generic error response with human-readable description.
|
||||
/// Note that we can't always present it to user as is.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ConsoleError {
|
||||
pub error: Box<str>,
|
||||
#[serde(skip)]
|
||||
pub http_status_code: http::StatusCode,
|
||||
pub status: Option<Status>,
|
||||
}
|
||||
|
||||
impl ConsoleError {
|
||||
pub fn get_reason(&self) -> Reason {
|
||||
self.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.error_info.as_ref())
|
||||
.map(|e| e.reason)
|
||||
.unwrap_or(Reason::Unknown)
|
||||
}
|
||||
pub fn get_user_facing_message(&self) -> String {
|
||||
use super::provider::errors::REQUEST_FAILED;
|
||||
self.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.user_facing_message.as_ref())
|
||||
.map(|m| m.message.clone().into())
|
||||
.unwrap_or_else(|| {
|
||||
// Ask @neondatabase/control-plane for review before adding more.
|
||||
match self.http_status_code {
|
||||
http::StatusCode::NOT_FOUND => {
|
||||
// Status 404: failed to get a project-related resource.
|
||||
format!("{REQUEST_FAILED}: endpoint cannot be found")
|
||||
}
|
||||
http::StatusCode::NOT_ACCEPTABLE => {
|
||||
// Status 406: endpoint is disabled (we don't allow connections).
|
||||
format!("{REQUEST_FAILED}: endpoint is disabled")
|
||||
}
|
||||
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
|
||||
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
|
||||
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
|
||||
}
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ConsoleError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let msg = self
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.user_facing_message.as_ref())
|
||||
.map(|m| m.message.as_ref())
|
||||
.unwrap_or_else(|| &self.error);
|
||||
write!(f, "{}", msg)
|
||||
}
|
||||
}
|
||||
|
||||
impl ShouldRetry for ConsoleError {
|
||||
fn could_retry(&self) -> bool {
|
||||
if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() {
|
||||
// retry some temporary failures because the compute was in a bad state
|
||||
// (bad request can be returned when the endpoint was in transition)
|
||||
return match &self {
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::BAD_REQUEST,
|
||||
..
|
||||
} => true,
|
||||
// don't retry when quotas are exceeded
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref error,
|
||||
..
|
||||
} => !error.contains("compute time quota of non-primary branches is exceeded"),
|
||||
// locked can be returned when the endpoint was in transition
|
||||
// or when quotas are exceeded. don't retry when quotas are exceeded
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::LOCKED,
|
||||
ref error,
|
||||
..
|
||||
} => {
|
||||
!error.contains("quota exceeded")
|
||||
&& !error.contains("the limit for current plan reached")
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
}
|
||||
|
||||
// retry if the response has a retry delay
|
||||
if let Some(retry_info) = self
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.retry_info.as_ref())
|
||||
{
|
||||
retry_info.retry_delay_ms > 0
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Status {
|
||||
pub code: Box<str>,
|
||||
pub message: Box<str>,
|
||||
pub details: Details,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Details {
|
||||
pub error_info: Option<ErrorInfo>,
|
||||
pub retry_info: Option<RetryInfo>,
|
||||
pub user_facing_message: Option<UserFacingMessage>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ErrorInfo {
|
||||
pub reason: Reason,
|
||||
// Schema could also have `metadata` field, but it's not structured. Skip it for now.
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Deserialize, Default)]
|
||||
pub enum Reason {
|
||||
#[serde(rename = "ROLE_PROTECTED")]
|
||||
RoleProtected,
|
||||
#[serde(rename = "RESOURCE_NOT_FOUND")]
|
||||
ResourceNotFound,
|
||||
#[serde(rename = "PROJECT_NOT_FOUND")]
|
||||
ProjectNotFound,
|
||||
#[serde(rename = "ENDPOINT_NOT_FOUND")]
|
||||
EndpointNotFound,
|
||||
#[serde(rename = "BRANCH_NOT_FOUND")]
|
||||
BranchNotFound,
|
||||
#[serde(rename = "RATE_LIMIT_EXCEEDED")]
|
||||
RateLimitExceeded,
|
||||
#[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")]
|
||||
NonPrimaryBranchComputeTimeExceeded,
|
||||
#[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")]
|
||||
ActiveTimeQuotaExceeded,
|
||||
#[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")]
|
||||
ComputeTimeQuotaExceeded,
|
||||
#[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")]
|
||||
WrittenDataQuotaExceeded,
|
||||
#[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")]
|
||||
DataTransferQuotaExceeded,
|
||||
#[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
|
||||
LogicalSizeQuotaExceeded,
|
||||
#[default]
|
||||
#[serde(other)]
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl Reason {
|
||||
pub fn is_not_found(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Reason::ResourceNotFound
|
||||
| Reason::ProjectNotFound
|
||||
| Reason::EndpointNotFound
|
||||
| Reason::BranchNotFound
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct RetryInfo {
|
||||
pub retry_delay_ms: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct UserFacingMessage {
|
||||
pub message: Box<str>,
|
||||
}
|
||||
|
||||
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
|
||||
|
||||
@@ -25,8 +25,8 @@ use tracing::info;
|
||||
|
||||
pub mod errors {
|
||||
use crate::{
|
||||
console::messages::{self, ConsoleError},
|
||||
error::{io_error, ReportableError, UserFacingError},
|
||||
http,
|
||||
proxy::retry::ShouldRetry,
|
||||
};
|
||||
use thiserror::Error;
|
||||
@@ -34,14 +34,17 @@ pub mod errors {
|
||||
use super::ApiLockError;
|
||||
|
||||
/// A go-to error message which doesn't leak any detail.
|
||||
pub const REQUEST_FAILED: &str = "Console request failed";
|
||||
const REQUEST_FAILED: &str = "Console request failed";
|
||||
|
||||
/// Common console API error.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ApiError {
|
||||
/// Error returned by the console itself.
|
||||
#[error("{REQUEST_FAILED} with {0}")]
|
||||
Console(ConsoleError),
|
||||
#[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
|
||||
Console {
|
||||
status: http::StatusCode,
|
||||
text: Box<str>,
|
||||
},
|
||||
|
||||
/// Various IO errors like broken pipe or malformed payload.
|
||||
#[error("{REQUEST_FAILED}: {0}")]
|
||||
@@ -50,11 +53,11 @@ pub mod errors {
|
||||
|
||||
impl ApiError {
|
||||
/// Returns HTTP status code if it's the reason for failure.
|
||||
pub fn get_reason(&self) -> messages::Reason {
|
||||
pub fn http_status_code(&self) -> Option<http::StatusCode> {
|
||||
use ApiError::*;
|
||||
match self {
|
||||
Console(e) => e.get_reason(),
|
||||
_ => messages::Reason::Unknown,
|
||||
Console { status, .. } => Some(*status),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -64,7 +67,22 @@ pub mod errors {
|
||||
use ApiError::*;
|
||||
match self {
|
||||
// To minimize risks, only select errors are forwarded to users.
|
||||
Console(c) => c.get_user_facing_message(),
|
||||
// Ask @neondatabase/control-plane for review before adding more.
|
||||
Console { status, .. } => match *status {
|
||||
http::StatusCode::NOT_FOUND => {
|
||||
// Status 404: failed to get a project-related resource.
|
||||
format!("{REQUEST_FAILED}: endpoint cannot be found")
|
||||
}
|
||||
http::StatusCode::NOT_ACCEPTABLE => {
|
||||
// Status 406: endpoint is disabled (we don't allow connections).
|
||||
format!("{REQUEST_FAILED}: endpoint is disabled")
|
||||
}
|
||||
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
|
||||
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
|
||||
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
|
||||
}
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
},
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
}
|
||||
}
|
||||
@@ -73,56 +91,29 @@ pub mod errors {
|
||||
impl ReportableError for ApiError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
ApiError::Console(e) => {
|
||||
use crate::error::ErrorKind::*;
|
||||
match e.get_reason() {
|
||||
crate::console::messages::Reason::RoleProtected => User,
|
||||
crate::console::messages::Reason::ResourceNotFound => User,
|
||||
crate::console::messages::Reason::ProjectNotFound => User,
|
||||
crate::console::messages::Reason::EndpointNotFound => User,
|
||||
crate::console::messages::Reason::BranchNotFound => User,
|
||||
crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit,
|
||||
crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
|
||||
User
|
||||
}
|
||||
crate::console::messages::Reason::ActiveTimeQuotaExceeded => User,
|
||||
crate::console::messages::Reason::ComputeTimeQuotaExceeded => User,
|
||||
crate::console::messages::Reason::WrittenDataQuotaExceeded => User,
|
||||
crate::console::messages::Reason::DataTransferQuotaExceeded => User,
|
||||
crate::console::messages::Reason::LogicalSizeQuotaExceeded => User,
|
||||
crate::console::messages::Reason::Unknown => match &e {
|
||||
ConsoleError {
|
||||
http_status_code:
|
||||
http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
|
||||
..
|
||||
} => crate::error::ErrorKind::User,
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
error,
|
||||
..
|
||||
} if error.contains(
|
||||
"compute time quota of non-primary branches is exceeded",
|
||||
) =>
|
||||
{
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::LOCKED,
|
||||
error,
|
||||
..
|
||||
} if error.contains("quota exceeded")
|
||||
|| error.contains("the limit for current plan reached") =>
|
||||
{
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
|
||||
..
|
||||
} => crate::error::ErrorKind::ServiceRateLimit,
|
||||
ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
|
||||
},
|
||||
}
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
|
||||
..
|
||||
} => crate::error::ErrorKind::User,
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
text,
|
||||
} if text.contains("compute time quota of non-primary branches is exceeded") => {
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::LOCKED,
|
||||
text,
|
||||
} if text.contains("quota exceeded")
|
||||
|| text.contains("the limit for current plan reached") =>
|
||||
{
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::TOO_MANY_REQUESTS,
|
||||
..
|
||||
} => crate::error::ErrorKind::ServiceRateLimit,
|
||||
ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
|
||||
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
|
||||
}
|
||||
}
|
||||
@@ -133,7 +124,31 @@ pub mod errors {
|
||||
match self {
|
||||
// retry some transport errors
|
||||
Self::Transport(io) => io.could_retry(),
|
||||
Self::Console(e) => e.could_retry(),
|
||||
// retry some temporary failures because the compute was in a bad state
|
||||
// (bad request can be returned when the endpoint was in transition)
|
||||
Self::Console {
|
||||
status: http::StatusCode::BAD_REQUEST,
|
||||
..
|
||||
} => true,
|
||||
// don't retry when quotas are exceeded
|
||||
Self::Console {
|
||||
status: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref text,
|
||||
} => !text.contains("compute time quota of non-primary branches is exceeded"),
|
||||
// locked can be returned when the endpoint was in transition
|
||||
// or when quotas are exceeded. don't retry when quotas are exceeded
|
||||
Self::Console {
|
||||
status: http::StatusCode::LOCKED,
|
||||
ref text,
|
||||
} => {
|
||||
// written data quota exceeded
|
||||
// data transfer quota exceeded
|
||||
// compute time quota exceeded
|
||||
// logical size quota exceeded
|
||||
!text.contains("quota exceeded")
|
||||
&& !text.contains("the limit for current plan reached")
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -494,7 +509,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
self.metrics
|
||||
.semaphore_acquire_seconds
|
||||
.observe(now.elapsed().as_secs_f64());
|
||||
info!("acquired permit {:?}", now.elapsed().as_secs_f64());
|
||||
|
||||
Ok(WakeComputePermit { permit: permit? })
|
||||
}
|
||||
|
||||
|
||||
@@ -94,14 +94,12 @@ impl Api {
|
||||
let body = match parse_body::<GetRoleSecret>(response).await {
|
||||
Ok(body) => body,
|
||||
// Error 404 is special: it's ok not to have a secret.
|
||||
// TODO(anna): retry
|
||||
Err(e) => {
|
||||
if e.get_reason().is_not_found() {
|
||||
Err(e) => match e.http_status_code() {
|
||||
Some(http::StatusCode::NOT_FOUND) => {
|
||||
return Ok(AuthInfo::default());
|
||||
} else {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
_otherwise => return Err(e.into()),
|
||||
},
|
||||
};
|
||||
|
||||
let secret = if body.role_secret.is_empty() {
|
||||
@@ -330,24 +328,19 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
|
||||
info!("request succeeded, processing the body");
|
||||
return Ok(response.json().await?);
|
||||
}
|
||||
let s = response.bytes().await?;
|
||||
// Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
|
||||
info!("response_error plaintext: {:?}", s);
|
||||
|
||||
// Don't throw an error here because it's not as important
|
||||
// as the fact that the request itself has failed.
|
||||
let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
|
||||
let body = response.json().await.unwrap_or_else(|e| {
|
||||
warn!("failed to parse error body: {e}");
|
||||
ConsoleError {
|
||||
error: "reason unclear (malformed error message)".into(),
|
||||
http_status_code: status,
|
||||
status: None,
|
||||
}
|
||||
});
|
||||
body.http_status_code = status;
|
||||
|
||||
error!("console responded with an error ({status}): {body:?}");
|
||||
Err(ApiError::Console(body))
|
||||
let text = body.error;
|
||||
error!("console responded with an error ({status}): {text}");
|
||||
Err(ApiError::Console { status, text })
|
||||
}
|
||||
|
||||
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::auth::backend::{
|
||||
};
|
||||
use crate::config::{CertResolver, RetryConfig};
|
||||
use crate::console::caches::NodeInfoCache;
|
||||
use crate::console::messages::{ConsoleError, MetricsAuxInfo};
|
||||
use crate::console::messages::MetricsAuxInfo;
|
||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||
use crate::error::ErrorKind;
|
||||
@@ -484,20 +484,18 @@ impl TestBackend for TestConnectMechanism {
|
||||
match action {
|
||||
ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
|
||||
ConnectAction::WakeFail => {
|
||||
let err = console::errors::ApiError::Console(ConsoleError {
|
||||
http_status_code: http::StatusCode::FORBIDDEN,
|
||||
error: "TEST".into(),
|
||||
status: None,
|
||||
});
|
||||
let err = console::errors::ApiError::Console {
|
||||
status: http::StatusCode::FORBIDDEN,
|
||||
text: "TEST".into(),
|
||||
};
|
||||
assert!(!err.could_retry());
|
||||
Err(console::errors::WakeComputeError::ApiError(err))
|
||||
}
|
||||
ConnectAction::WakeRetry => {
|
||||
let err = console::errors::ApiError::Console(ConsoleError {
|
||||
http_status_code: http::StatusCode::BAD_REQUEST,
|
||||
error: "TEST".into(),
|
||||
status: None,
|
||||
});
|
||||
let err = console::errors::ApiError::Console {
|
||||
status: http::StatusCode::BAD_REQUEST,
|
||||
text: "TEST".into(),
|
||||
};
|
||||
assert!(err.could_retry());
|
||||
Err(console::errors::WakeComputeError::ApiError(err))
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::config::RetryConfig;
|
||||
use crate::console::messages::ConsoleError;
|
||||
use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::metrics::{
|
||||
@@ -89,76 +88,36 @@ fn report_error(e: &WakeComputeError, retry: bool) {
|
||||
let kind = match e {
|
||||
WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress,
|
||||
WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError,
|
||||
WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() {
|
||||
crate::console::messages::Reason::RoleProtected => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::ResourceNotFound => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::ProjectNotFound => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::EndpointNotFound => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::BranchNotFound => {
|
||||
WakeupFailureKind::ApiConsoleBadRequest
|
||||
}
|
||||
crate::console::messages::Reason::RateLimitExceeded => {
|
||||
WakeupFailureKind::ApiConsoleLocked
|
||||
}
|
||||
crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::ActiveTimeQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::ComputeTimeQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::WrittenDataQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::DataTransferQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::LogicalSizeQuotaExceeded => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
crate::console::messages::Reason::Unknown => match e {
|
||||
ConsoleError {
|
||||
http_status_code: StatusCode::LOCKED,
|
||||
ref error,
|
||||
..
|
||||
} if error.contains("written data quota exceeded")
|
||||
|| error.contains("the limit for current plan reached") =>
|
||||
{
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
ConsoleError {
|
||||
http_status_code: StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref error,
|
||||
..
|
||||
} if error.contains("compute time quota of non-primary branches is exceeded") => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
ConsoleError {
|
||||
http_status_code: StatusCode::LOCKED,
|
||||
..
|
||||
} => WakeupFailureKind::ApiConsoleLocked,
|
||||
ConsoleError {
|
||||
http_status_code: StatusCode::BAD_REQUEST,
|
||||
..
|
||||
} => WakeupFailureKind::ApiConsoleBadRequest,
|
||||
ConsoleError {
|
||||
http_status_code, ..
|
||||
} if http_status_code.is_server_error() => {
|
||||
WakeupFailureKind::ApiConsoleOtherServerError
|
||||
}
|
||||
ConsoleError { .. } => WakeupFailureKind::ApiConsoleOtherError,
|
||||
},
|
||||
},
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::LOCKED,
|
||||
ref text,
|
||||
}) if text.contains("written data quota exceeded")
|
||||
|| text.contains("the limit for current plan reached") =>
|
||||
{
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref text,
|
||||
}) if text.contains("compute time quota of non-primary branches is exceeded") => {
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::LOCKED,
|
||||
..
|
||||
}) => WakeupFailureKind::ApiConsoleLocked,
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::BAD_REQUEST,
|
||||
..
|
||||
}) => WakeupFailureKind::ApiConsoleBadRequest,
|
||||
WakeComputeError::ApiError(ApiError::Console { status, .. })
|
||||
if status.is_server_error() =>
|
||||
{
|
||||
WakeupFailureKind::ApiConsoleOtherServerError
|
||||
}
|
||||
WakeComputeError::ApiError(ApiError::Console { .. }) => {
|
||||
WakeupFailureKind::ApiConsoleOtherError
|
||||
}
|
||||
WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
|
||||
WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError,
|
||||
};
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::usize;
|
||||
|
||||
use super::{LimitAlgorithm, Outcome, Sample};
|
||||
|
||||
/// Loss-based congestion avoidance.
|
||||
|
||||
@@ -32,6 +32,8 @@ pub struct ClientFirstMessage<'a> {
|
||||
pub bare: &'a str,
|
||||
/// Channel binding mode.
|
||||
pub cbind_flag: ChannelBinding<&'a str>,
|
||||
/// (Client username)[<https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf/src/backend/libpq/auth-scram.c#L13>].
|
||||
pub username: &'a str,
|
||||
/// Client nonce.
|
||||
pub nonce: &'a str,
|
||||
}
|
||||
@@ -56,14 +58,6 @@ impl<'a> ClientFirstMessage<'a> {
|
||||
|
||||
// In theory, these might be preceded by "reserved-mext" (i.e. "m=")
|
||||
let username = parts.next()?.strip_prefix("n=")?;
|
||||
|
||||
// https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14
|
||||
if !username.is_empty() {
|
||||
tracing::warn!(username, "scram username provided, but is not expected")
|
||||
// TODO(conrad):
|
||||
// return None;
|
||||
}
|
||||
|
||||
let nonce = parts.next()?.strip_prefix("r=")?;
|
||||
|
||||
// Validate but ignore auth extensions
|
||||
@@ -72,6 +66,7 @@ impl<'a> ClientFirstMessage<'a> {
|
||||
Some(Self {
|
||||
bare,
|
||||
cbind_flag,
|
||||
username,
|
||||
nonce,
|
||||
})
|
||||
}
|
||||
@@ -193,18 +188,19 @@ mod tests {
|
||||
|
||||
// (Almost) real strings captured during debug sessions
|
||||
let cases = [
|
||||
(NotSupportedClient, "n,,n=,r=t8JwklwKecDLwSsA72rHmVju"),
|
||||
(NotSupportedServer, "y,,n=,r=t8JwklwKecDLwSsA72rHmVju"),
|
||||
(NotSupportedClient, "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"),
|
||||
(NotSupportedServer, "y,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"),
|
||||
(
|
||||
Required("tls-server-end-point"),
|
||||
"p=tls-server-end-point,,n=,r=t8JwklwKecDLwSsA72rHmVju",
|
||||
"p=tls-server-end-point,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju",
|
||||
),
|
||||
];
|
||||
|
||||
for (cb, input) in cases {
|
||||
let msg = ClientFirstMessage::parse(input).unwrap();
|
||||
|
||||
assert_eq!(msg.bare, "n=,r=t8JwklwKecDLwSsA72rHmVju");
|
||||
assert_eq!(msg.bare, "n=pepe,r=t8JwklwKecDLwSsA72rHmVju");
|
||||
assert_eq!(msg.username, "pepe");
|
||||
assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju");
|
||||
assert_eq!(msg.cbind_flag, cb);
|
||||
}
|
||||
@@ -212,13 +208,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_client_first_message_with_invalid_gs2_authz() {
|
||||
assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none())
|
||||
assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_first_message_with_extra_params() {
|
||||
let msg = ClientFirstMessage::parse("n,,n=,r=nonce,a=foo,b=bar,c=baz").unwrap();
|
||||
assert_eq!(msg.bare, "n=,r=nonce,a=foo,b=bar,c=baz");
|
||||
let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap();
|
||||
assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz");
|
||||
assert_eq!(msg.username, "user");
|
||||
assert_eq!(msg.nonce, "nonce");
|
||||
assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient);
|
||||
}
|
||||
@@ -226,9 +223,9 @@ mod tests {
|
||||
#[test]
|
||||
fn parse_client_first_message_with_extra_params_invalid() {
|
||||
// must be of the form `<ascii letter>=<...>`
|
||||
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,abc=foo").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,1=foo").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,a").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -45,10 +45,6 @@ pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
|
||||
err.to_string(),
|
||||
StatusCode::REQUEST_TIMEOUT,
|
||||
),
|
||||
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
|
||||
this.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.79.0"
|
||||
channel = "1.78.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[package]
|
||||
name = "storage_scrubber"
|
||||
name = "s3_scrubber"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
@@ -1,4 +1,4 @@
|
||||
# Neon Storage Scrubber
|
||||
# Neon S3 scrubber
|
||||
|
||||
This tool directly accesses the S3 buckets used by the Neon `pageserver`
|
||||
and `safekeeper`, and does housekeeping such as cleaning up objects for tenants & timelines that no longer exist.
|
||||
@@ -78,16 +78,17 @@ pub(crate) fn branch_cleanup_and_check_errors(
|
||||
index_part_generation: _index_part_generation,
|
||||
s3_layers: _s3_layers,
|
||||
} => {
|
||||
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) {
|
||||
result
|
||||
.errors
|
||||
.push(format!("index_part.json version: {}", index_part.version()))
|
||||
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
|
||||
result.errors.push(format!(
|
||||
"index_part.json version: {}",
|
||||
index_part.get_version()
|
||||
))
|
||||
}
|
||||
|
||||
if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
|
||||
if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
|
||||
result.warnings.push(format!(
|
||||
"index_part.json version is not latest: {}",
|
||||
index_part.version()
|
||||
index_part.get_version()
|
||||
))
|
||||
}
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use anyhow::bail;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
|
||||
use storage_scrubber::pageserver_physical_gc::GcMode;
|
||||
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
|
||||
use storage_scrubber::tenant_snapshot::SnapshotDownloader;
|
||||
use storage_scrubber::{
|
||||
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
|
||||
use s3_scrubber::pageserver_physical_gc::GcMode;
|
||||
use s3_scrubber::scan_pageserver_metadata::scan_metadata;
|
||||
use s3_scrubber::tenant_snapshot::SnapshotDownloader;
|
||||
use s3_scrubber::{
|
||||
init_logging, pageserver_physical_gc::pageserver_physical_gc,
|
||||
scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
|
||||
TraversingDepth,
|
||||
@@ -125,7 +125,7 @@ impl MetadataSummary {
|
||||
{
|
||||
*self
|
||||
.indices_by_version
|
||||
.entry(index_part.version())
|
||||
.entry(index_part.get_version())
|
||||
.or_insert(0) += 1;
|
||||
|
||||
if let Err(e) = self.update_histograms(index_part) {
|
||||
@@ -213,9 +213,6 @@ pub async fn main_task(
|
||||
}
|
||||
};
|
||||
|
||||
// remove timeline from the broker active set sooner, before waiting for background tasks
|
||||
tli_broker_active.set(false);
|
||||
|
||||
// shutdown background tasks
|
||||
if conf.is_wal_backup_enabled() {
|
||||
wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await;
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
|
||||
use rand::Rng;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -275,6 +276,13 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
debug!("started");
|
||||
let await_duration = conf.partial_backup_timeout;
|
||||
|
||||
// sleep for random time to avoid thundering herd
|
||||
{
|
||||
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
|
||||
let sleep_duration = await_duration.mul_f64(randf64);
|
||||
tokio::time::sleep(sleep_duration).await;
|
||||
}
|
||||
|
||||
let (_, persistent_state) = tli.get_state().await;
|
||||
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
|
||||
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
|
||||
|
||||
@@ -31,7 +31,6 @@ pub(crate) enum PageserverState {
|
||||
Available {
|
||||
last_seen_at: Instant,
|
||||
utilization: PageserverUtilization,
|
||||
new: bool,
|
||||
},
|
||||
Offline,
|
||||
}
|
||||
@@ -128,7 +127,6 @@ impl HeartbeaterTask {
|
||||
heartbeat_futs.push({
|
||||
let jwt_token = self.jwt_token.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
let new_node = !self.state.contains_key(node_id);
|
||||
|
||||
// Clone the node and mark it as available such that the request
|
||||
// goes through to the pageserver even when the node is marked offline.
|
||||
@@ -161,7 +159,6 @@ impl HeartbeaterTask {
|
||||
PageserverState::Available {
|
||||
last_seen_at: Instant::now(),
|
||||
utilization,
|
||||
new: new_node,
|
||||
}
|
||||
} else {
|
||||
PageserverState::Offline
|
||||
@@ -223,7 +220,6 @@ impl HeartbeaterTask {
|
||||
}
|
||||
},
|
||||
Vacant(_) => {
|
||||
// This is a new node. Don't generate a delta for it.
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
|
||||
TenantLocateResponseShard, UtilizationScore,
|
||||
TenantLocateResponseShard,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
@@ -116,16 +116,6 @@ impl Node {
|
||||
match (self.availability, availability) {
|
||||
(Offline, Active(_)) => ToActive,
|
||||
(Active(_), Offline) => ToOffline,
|
||||
// Consider the case when the storage controller handles the re-attach of a node
|
||||
// before the heartbeats detect that the node is back online. We still need
|
||||
// [`Service::node_configure`] to attempt reconciliations for shards with an
|
||||
// unknown observed location.
|
||||
// The unsavoury match arm below handles this situation.
|
||||
(Active(lhs), Active(rhs))
|
||||
if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() =>
|
||||
{
|
||||
ToActive
|
||||
}
|
||||
_ => Unchanged,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,8 +29,6 @@ pub enum MaySchedule {
|
||||
struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
shard_count: usize,
|
||||
/// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
attached_shard_count: usize,
|
||||
|
||||
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
||||
/// from a node's availability state and scheduling policy).
|
||||
@@ -44,9 +42,7 @@ impl PartialEq for SchedulerNode {
|
||||
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
|
||||
);
|
||||
|
||||
may_schedule_matches
|
||||
&& self.shard_count == other.shard_count
|
||||
&& self.attached_shard_count == other.attached_shard_count
|
||||
may_schedule_matches && self.shard_count == other.shard_count
|
||||
}
|
||||
}
|
||||
|
||||
@@ -142,15 +138,6 @@ impl ScheduleContext {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum RefCountUpdate {
|
||||
PromoteSecondary,
|
||||
Attach,
|
||||
Detach,
|
||||
DemoteAttached,
|
||||
AddSecondary,
|
||||
RemoveSecondary,
|
||||
}
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
@@ -159,7 +146,6 @@ impl Scheduler {
|
||||
node.get_id(),
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
@@ -185,7 +171,6 @@ impl Scheduler {
|
||||
node.get_id(),
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
@@ -194,10 +179,7 @@ impl Scheduler {
|
||||
for shard in shards {
|
||||
if let Some(node_id) = shard.intent.get_attached() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => {
|
||||
node.shard_count += 1;
|
||||
node.attached_shard_count += 1;
|
||||
}
|
||||
Some(node) => node.shard_count += 1,
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
@@ -245,42 +227,31 @@ impl Scheduler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update the reference counts of a node. These reference counts are used to guide scheduling
|
||||
/// decisions, not for memory management: they represent the number of tenant shard whose IntentState
|
||||
/// targets this node and the number of tenants shars whose IntentState is attached to this
|
||||
/// node.
|
||||
/// Increment the reference count of a node. This reference count is used to guide scheduling
|
||||
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
|
||||
/// this node.
|
||||
///
|
||||
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
|
||||
/// [`Self::new`] or [`Self::node_upsert`])
|
||||
pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
|
||||
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
debug_assert!(false);
|
||||
return;
|
||||
};
|
||||
|
||||
node.shard_count += 1;
|
||||
}
|
||||
|
||||
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
|
||||
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
debug_assert!(false);
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
return;
|
||||
};
|
||||
|
||||
match update {
|
||||
RefCountUpdate::PromoteSecondary => {
|
||||
node.attached_shard_count += 1;
|
||||
}
|
||||
RefCountUpdate::Attach => {
|
||||
node.shard_count += 1;
|
||||
node.attached_shard_count += 1;
|
||||
}
|
||||
RefCountUpdate::Detach => {
|
||||
node.shard_count -= 1;
|
||||
node.attached_shard_count -= 1;
|
||||
}
|
||||
RefCountUpdate::DemoteAttached => {
|
||||
node.attached_shard_count -= 1;
|
||||
}
|
||||
RefCountUpdate::AddSecondary => {
|
||||
node.shard_count += 1;
|
||||
}
|
||||
RefCountUpdate::RemoveSecondary => {
|
||||
node.shard_count -= 1;
|
||||
}
|
||||
}
|
||||
node.shard_count -= 1;
|
||||
}
|
||||
|
||||
pub(crate) fn node_upsert(&mut self, node: &Node) {
|
||||
@@ -292,7 +263,6 @@ impl Scheduler {
|
||||
Vacant(entry) => {
|
||||
entry.insert(SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
});
|
||||
}
|
||||
@@ -415,11 +385,6 @@ impl Scheduler {
|
||||
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
|
||||
self.nodes.get(&node_id).unwrap().shard_count
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
|
||||
self.nodes.get(&node_id).unwrap().attached_shard_count
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -472,28 +437,18 @@ mod tests {
|
||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
|
||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
|
||||
|
||||
t1_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
|
||||
|
||||
let total_attached = scheduler.get_node_attached_shard_count(NodeId(1))
|
||||
+ scheduler.get_node_attached_shard_count(NodeId(2));
|
||||
assert_eq!(total_attached, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Dropping an IntentState without clearing it causes a panic in debug mode,
|
||||
@@ -504,12 +459,8 @@ mod tests {
|
||||
assert!(result.is_err());
|
||||
} else {
|
||||
t2_intent.clear(&mut scheduler);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
|
||||
persistence::{AbortShardSplitStatus, TenantFilter},
|
||||
reconciler::{ReconcileError, ReconcileUnits},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
|
||||
scheduler::{ScheduleContext, ScheduleMode},
|
||||
tenant_shard::{
|
||||
MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction,
|
||||
},
|
||||
@@ -747,61 +747,29 @@ impl Service {
|
||||
let res = self.heartbeater.heartbeat(nodes).await;
|
||||
if let Ok(deltas) = res {
|
||||
for (node_id, state) in deltas.0 {
|
||||
let (new_node, new_availability) = match state {
|
||||
PageserverState::Available {
|
||||
utilization, new, ..
|
||||
} => (
|
||||
new,
|
||||
NodeAvailability::Active(UtilizationScore(
|
||||
utilization.utilization_score,
|
||||
)),
|
||||
let new_availability = match state {
|
||||
PageserverState::Available { utilization, .. } => NodeAvailability::Active(
|
||||
UtilizationScore(utilization.utilization_score),
|
||||
),
|
||||
PageserverState::Offline => (false, NodeAvailability::Offline),
|
||||
PageserverState::Offline => NodeAvailability::Offline,
|
||||
};
|
||||
let res = self
|
||||
.node_configure(node_id, Some(new_availability), None)
|
||||
.await;
|
||||
|
||||
if new_node {
|
||||
// When the heartbeats detect a newly added node, we don't wish
|
||||
// to attempt to reconcile the shards assigned to it. The node
|
||||
// is likely handling it's re-attach response, so reconciling now
|
||||
// would be counterproductive.
|
||||
//
|
||||
// Instead, update the in-memory state with the details learned about the
|
||||
// node.
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, _tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let mut new_nodes = (**nodes).clone();
|
||||
|
||||
if let Some(node) = new_nodes.get_mut(&node_id) {
|
||||
node.set_availability(new_availability);
|
||||
scheduler.node_upsert(node);
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(ApiError::NotFound(_)) => {
|
||||
// This should be rare, but legitimate since the heartbeats are done
|
||||
// on a snapshot of the nodes.
|
||||
tracing::info!("Node {} was not found after heartbeat round", node_id);
|
||||
}
|
||||
|
||||
locked.nodes = Arc::new(new_nodes);
|
||||
} else {
|
||||
// This is the code path for geniune availability transitions (i.e node
|
||||
// goes unavailable and/or comes back online).
|
||||
let res = self
|
||||
.node_configure(node_id, Some(new_availability), None)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(ApiError::NotFound(_)) => {
|
||||
// This should be rare, but legitimate since the heartbeats are done
|
||||
// on a snapshot of the nodes.
|
||||
tracing::info!(
|
||||
"Node {} was not found after heartbeat round",
|
||||
node_id
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
"Failed to update node {} after heartbeat round: {}",
|
||||
node_id,
|
||||
err
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
"Failed to update node {} after heartbeat round: {}",
|
||||
node_id,
|
||||
err
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2441,17 +2409,11 @@ impl Service {
|
||||
(detach_waiters, shard_ids, node.clone())
|
||||
};
|
||||
|
||||
// This reconcile wait can fail in a few ways:
|
||||
// A there is a very long queue for the reconciler semaphore
|
||||
// B some pageserver is failing to handle a detach promptly
|
||||
// C some pageserver goes offline right at the moment we send it a request.
|
||||
//
|
||||
// A and C are transient: the semaphore will eventually become available, and once a node is marked offline
|
||||
// the next attempt to reconcile will silently skip detaches for an offline node and succeed. If B happens,
|
||||
// it's a bug, and needs resolving at the pageserver level (we shouldn't just leave attachments behind while
|
||||
// deleting the underlying data).
|
||||
self.await_waiters(detach_waiters, RECONCILE_TIMEOUT)
|
||||
.await?;
|
||||
if let Err(e) = self.await_waiters(detach_waiters, RECONCILE_TIMEOUT).await {
|
||||
// Failing to detach shouldn't hold up deletion, e.g. if a node is offline we should be able
|
||||
// to use some other node to run the remote deletion.
|
||||
tracing::warn!("Failed to detach some locations: {e}");
|
||||
}
|
||||
|
||||
let locations = shard_ids
|
||||
.into_iter()
|
||||
@@ -2469,11 +2431,13 @@ impl Service {
|
||||
for result in results {
|
||||
match result {
|
||||
Ok(StatusCode::ACCEPTED) => {
|
||||
// This should never happen: we waited for detaches to finish above
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Unexpectedly still attached on {}",
|
||||
// This could happen if we failed detach above, and hit a pageserver where the tenant
|
||||
// is still attached: it will accept the deletion in the background
|
||||
tracing::warn!(
|
||||
"Unexpectedly still attached on {}, client should retry",
|
||||
node
|
||||
)));
|
||||
);
|
||||
return Ok(StatusCode::ACCEPTED);
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(mgmt_api::Error::Cancelled) => {
|
||||
@@ -4348,17 +4312,7 @@ impl Service {
|
||||
continue;
|
||||
}
|
||||
|
||||
if !new_nodes
|
||||
.values()
|
||||
.any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_)))
|
||||
{
|
||||
// Special case for when all nodes are unavailable and/or unschedulable: there is no point
|
||||
// trying to reschedule since there's nowhere else to go. Without this
|
||||
// branch we incorrectly detach tenants in response to node unavailability.
|
||||
continue;
|
||||
}
|
||||
|
||||
if tenant_shard.intent.demote_attached(scheduler, node_id) {
|
||||
if tenant_shard.intent.demote_attached(node_id) {
|
||||
tenant_shard.sequence = tenant_shard.sequence.next();
|
||||
|
||||
// TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
|
||||
@@ -4395,12 +4349,6 @@ impl Service {
|
||||
// When a node comes back online, we must reconcile any tenant that has a None observed
|
||||
// location on the node.
|
||||
for tenant_shard in locked.tenants.values_mut() {
|
||||
// If a reconciliation is already in progress, rely on the previous scheduling
|
||||
// decision and skip triggering a new reconciliation.
|
||||
if tenant_shard.reconciler.is_some() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
|
||||
if observed_loc.conf.is_none() {
|
||||
self.maybe_reconcile_shard(tenant_shard, &new_nodes);
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::{
|
||||
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
||||
persistence::TenantShardPersistence,
|
||||
reconciler::ReconcileUnits,
|
||||
scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
|
||||
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
|
||||
};
|
||||
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
|
||||
use pageserver_api::{
|
||||
@@ -153,7 +153,7 @@ impl IntentState {
|
||||
}
|
||||
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
|
||||
if let Some(node_id) = node_id {
|
||||
scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach);
|
||||
scheduler.node_inc_ref(node_id);
|
||||
}
|
||||
Self {
|
||||
attached: node_id,
|
||||
@@ -164,10 +164,10 @@ impl IntentState {
|
||||
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
|
||||
if self.attached != new_attached {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
}
|
||||
if let Some(new_attached) = &new_attached {
|
||||
scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach);
|
||||
scheduler.node_inc_ref(*new_attached);
|
||||
}
|
||||
self.attached = new_attached;
|
||||
}
|
||||
@@ -177,27 +177,22 @@ impl IntentState {
|
||||
/// secondary to attached while maintaining the scheduler's reference counts.
|
||||
pub(crate) fn promote_attached(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
_scheduler: &mut Scheduler,
|
||||
promote_secondary: NodeId,
|
||||
) {
|
||||
// If we call this with a node that isn't in secondary, it would cause incorrect
|
||||
// scheduler reference counting, since we assume the node is already referenced as a secondary.
|
||||
debug_assert!(self.secondary.contains(&promote_secondary));
|
||||
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.secondary.retain(|n| n != &promote_secondary);
|
||||
|
||||
let demoted = self.attached;
|
||||
self.attached = Some(promote_secondary);
|
||||
|
||||
scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary);
|
||||
if let Some(demoted) = demoted {
|
||||
scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
|
||||
debug_assert!(!self.secondary.contains(&new_secondary));
|
||||
scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary);
|
||||
scheduler.node_inc_ref(new_secondary);
|
||||
self.secondary.push(new_secondary);
|
||||
}
|
||||
|
||||
@@ -205,27 +200,27 @@ impl IntentState {
|
||||
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
|
||||
let index = self.secondary.iter().position(|n| *n == node_id);
|
||||
if let Some(index) = index {
|
||||
scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
|
||||
scheduler.node_dec_ref(node_id);
|
||||
self.secondary.remove(index);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
|
||||
for secondary in self.secondary.drain(..) {
|
||||
scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary);
|
||||
scheduler.node_dec_ref(secondary);
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove the last secondary node from the list of secondaries
|
||||
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
|
||||
if let Some(node_id) = self.secondary.pop() {
|
||||
scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
|
||||
scheduler.node_dec_ref(node_id);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
}
|
||||
|
||||
self.clear_secondary(scheduler);
|
||||
@@ -256,11 +251,12 @@ impl IntentState {
|
||||
/// forget the location on the offline node.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool {
|
||||
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
@@ -597,7 +593,7 @@ impl TenantShard {
|
||||
Secondary => {
|
||||
if let Some(node_id) = self.intent.get_attached() {
|
||||
// Populate secondary by demoting the attached node
|
||||
self.intent.demote_attached(scheduler, *node_id);
|
||||
self.intent.demote_attached(*node_id);
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
@@ -787,7 +783,7 @@ impl TenantShard {
|
||||
old_attached_node_id,
|
||||
new_attached_node_id,
|
||||
}) => {
|
||||
self.intent.demote_attached(scheduler, old_attached_node_id);
|
||||
self.intent.demote_attached(old_attached_node_id);
|
||||
self.intent
|
||||
.promote_attached(scheduler, new_attached_node_id);
|
||||
}
|
||||
@@ -1325,9 +1321,7 @@ pub(crate) mod tests {
|
||||
assert_ne!(attached_node_id, secondary_node_id);
|
||||
|
||||
// Notifying the attached node is offline should demote it to a secondary
|
||||
let changed = tenant_shard
|
||||
.intent
|
||||
.demote_attached(&mut scheduler, attached_node_id);
|
||||
let changed = tenant_shard.intent.demote_attached(attached_node_id);
|
||||
assert!(changed);
|
||||
assert!(tenant_shard.intent.attached.is_none());
|
||||
assert_eq!(tenant_shard.intent.secondary.len(), 2);
|
||||
@@ -1610,14 +1604,7 @@ pub(crate) mod tests {
|
||||
|
||||
// We should see equal number of locations on the two nodes.
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
|
||||
// Scheduling does not consider the number of attachments picking the initial
|
||||
// pageserver to attach to (hence the assertion that all primaries are on the
|
||||
// same node)
|
||||
// TODO: Tweak the scheduling to evenly distribute attachments for new shards.
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
|
||||
|
||||
// Add another two nodes: we should see the shards spread out when their optimize
|
||||
// methods are called
|
||||
@@ -1626,16 +1613,9 @@ pub(crate) mod tests {
|
||||
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1);
|
||||
|
||||
for shard in shards.iter_mut() {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user