mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-26 14:50:36 +00:00
Compare commits
61 Commits
release-56
...
vlad/tmp/g
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fce602ed30 | ||
|
|
fc24ba5233 | ||
|
|
e200a2b01e | ||
|
|
2a7f224306 | ||
|
|
d86ddf2b76 | ||
|
|
86d5f4ada9 | ||
|
|
089edb55e8 | ||
|
|
1302f9442a | ||
|
|
80612d2688 | ||
|
|
7f96ac3435 | ||
|
|
999fbbb2a3 | ||
|
|
d22e0b5398 | ||
|
|
58340f9dbf | ||
|
|
fcbac527b0 | ||
|
|
a5154cf990 | ||
|
|
bfe5df8c4e | ||
|
|
46927bc228 | ||
|
|
bb9c792813 | ||
|
|
126bcc3794 | ||
|
|
4c2100794b | ||
|
|
d3b892e9ad | ||
|
|
7515d0f368 | ||
|
|
e27ce38619 | ||
|
|
e46692788e | ||
|
|
a8ca7a1a1d | ||
|
|
b52e31c1a4 | ||
|
|
5a7e285c2c | ||
|
|
ae5badd375 | ||
|
|
3e63d0f9e0 | ||
|
|
3b647cd55d | ||
|
|
26c68f91f3 | ||
|
|
2078dc827b | ||
|
|
8ee191c271 | ||
|
|
66c6b270f1 | ||
|
|
e4e444f59f | ||
|
|
d46d19456d | ||
|
|
5d05013857 | ||
|
|
014509987d | ||
|
|
75bca9bb19 | ||
|
|
a8be07785e | ||
|
|
630cfbe420 | ||
|
|
0a65333fff | ||
|
|
91dd99038e | ||
|
|
83ab14e271 | ||
|
|
85ef6b1645 | ||
|
|
1a8d53ab9d | ||
|
|
3d6e389aa2 | ||
|
|
17116f2ea9 | ||
|
|
fd22fc5b7d | ||
|
|
0112097e13 | ||
|
|
9d4c113f9b | ||
|
|
0acb604fa3 | ||
|
|
387a36874c | ||
|
|
00032c9d9f | ||
|
|
11bb265de1 | ||
|
|
69026a9a36 | ||
|
|
7006caf3a1 | ||
|
|
69d18d6429 | ||
|
|
acf0a11fea | ||
|
|
c1f55c1525 | ||
|
|
34f450c05a |
@@ -8,6 +8,7 @@
|
||||
!scripts/combine_control_files.py
|
||||
!scripts/ninstall.sh
|
||||
!vm-cgconfig.conf
|
||||
!docker-compose/run-tests.sh
|
||||
|
||||
# Directories
|
||||
!.cargo/
|
||||
|
||||
53
.github/workflows/approved-for-ci-run.yml
vendored
53
.github/workflows/approved-for-ci-run.yml
vendored
@@ -69,15 +69,41 @@ jobs:
|
||||
with:
|
||||
ref: main
|
||||
token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
- name: Look for existing PR
|
||||
id: get-pr
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
|
||||
echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
|
||||
|
||||
- name: Get changed labels
|
||||
id: get-labels
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED != ''
|
||||
env:
|
||||
ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \
|
||||
<(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
|
||||
( grep -v run-e2e-tests-in-draft || true ) | paste -sd , -)
|
||||
LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \
|
||||
<(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
|
||||
paste -sd , -)
|
||||
echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
|
||||
echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}
|
||||
|
||||
- run: gh pr checkout "${PR_NUMBER}"
|
||||
|
||||
- run: git checkout -b "${BRANCH}"
|
||||
|
||||
- run: git push --force origin "${BRANCH}"
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED == ''
|
||||
|
||||
- name: Create a Pull Request for CI run (if required)
|
||||
env:
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED == ''
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
@@ -88,16 +114,33 @@ jobs:
|
||||
Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
|
||||
EOF
|
||||
|
||||
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
|
||||
if [ -z "${ALREADY_CREATED}" ]; then
|
||||
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
|
||||
LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER} --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft )| \
|
||||
grep -E '^run' | paste -sd , -)
|
||||
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
|
||||
--body-file "body.md" \
|
||||
--head "${BRANCH}" \
|
||||
--base "main" \
|
||||
--label "run-e2e-tests-in-draft" \
|
||||
--label ${LABELS} \
|
||||
--draft
|
||||
- name: Modify the existing pull request (if required)
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED != ''
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }}
|
||||
LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }}
|
||||
ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
|
||||
run: |
|
||||
ADD_CMD=
|
||||
REMOVE_CMD=
|
||||
[ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}"
|
||||
[ -z "${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}"
|
||||
if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then
|
||||
gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD}
|
||||
fi
|
||||
|
||||
- run: git push --force origin "${BRANCH}"
|
||||
if: steps.get-pr.outputs.ALREADY_CREATED != ''
|
||||
|
||||
cleanup:
|
||||
# Close PRs and delete branchs if the original PR is closed.
|
||||
|
||||
|
||||
29
.github/workflows/build_and_test.yml
vendored
29
.github/workflows/build_and_test.yml
vendored
@@ -858,6 +858,26 @@ jobs:
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Build neon extensions test image
|
||||
if: matrix.version == 'v16'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
target: neon-pg-ext-test
|
||||
cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||
tags: |
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
@@ -902,6 +922,13 @@ jobs:
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- name: Create multi-arch neon-test-extensions image
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
|
||||
|
||||
- name: Create multi-arch compute-tools image
|
||||
if: matrix.version == 'v16'
|
||||
run: |
|
||||
@@ -1020,7 +1047,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify docker-compose example
|
||||
- name: Verify docker-compose example and test extensions
|
||||
timeout-minutes: 20
|
||||
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
|
||||
|
||||
|
||||
3
Cargo.lock
generated
3
Cargo.lock
generated
@@ -5129,6 +5129,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"hex",
|
||||
"histogram",
|
||||
"humantime",
|
||||
"itertools",
|
||||
"once_cell",
|
||||
"pageserver",
|
||||
@@ -5800,6 +5801,7 @@ dependencies = [
|
||||
"r2d2",
|
||||
"reqwest 0.12.4",
|
||||
"routerify",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"strum",
|
||||
@@ -5819,6 +5821,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
|
||||
@@ -89,7 +89,7 @@ RUN apt update && \
|
||||
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
|
||||
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
|
||||
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
||||
mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make clean && cp -R /sfcgal/* /
|
||||
@@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
|
||||
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
@@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg
|
||||
|
||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
||||
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -149,7 +149,7 @@ RUN apt update && \
|
||||
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
|
||||
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
@@ -194,7 +194,7 @@ RUN case "$(uname -m)" in \
|
||||
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
|
||||
echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
|
||||
mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -204,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
|
||||
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
|
||||
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -222,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
|
||||
echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
@@ -243,12 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
|
||||
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||
# because we build the images on different machines than where we run them.
|
||||
# Pass OPTFLAGS="" to remove it.
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
@@ -266,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
|
||||
echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
@@ -281,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
|
||||
echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
@@ -297,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
|
||||
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
||||
@@ -313,7 +313,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
||||
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
@@ -329,7 +329,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
|
||||
echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
||||
@@ -345,7 +345,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
|
||||
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
|
||||
mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
|
||||
mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
|
||||
@@ -361,7 +361,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
|
||||
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
|
||||
mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
|
||||
@@ -377,7 +377,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
|
||||
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
|
||||
mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
|
||||
@@ -393,7 +393,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
|
||||
echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
@@ -424,7 +424,7 @@ RUN case "${PG_VERSION}" in \
|
||||
apt-get install -y cmake && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
|
||||
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
|
||||
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
||||
mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
||||
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
|
||||
cd build && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -462,7 +462,7 @@ RUN case "${PG_VERSION}" in \
|
||||
esac && \
|
||||
wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
|
||||
echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
|
||||
@@ -481,7 +481,7 @@ RUN apt-get update && \
|
||||
apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
|
||||
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
|
||||
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
@@ -505,7 +505,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
|
||||
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
|
||||
@@ -531,7 +531,7 @@ RUN apt-get update && \
|
||||
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
|
||||
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
|
||||
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
|
||||
mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
|
||||
cmake \
|
||||
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
|
||||
-D RDK_BUILD_INCHI_SUPPORT=ON \
|
||||
@@ -571,7 +571,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
|
||||
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
|
||||
@@ -588,7 +588,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
|
||||
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
|
||||
@@ -605,7 +605,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
|
||||
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
|
||||
@@ -631,7 +631,7 @@ RUN case "${PG_VERSION}" in \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
|
||||
@@ -647,7 +647,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
@@ -696,7 +696,7 @@ ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
|
||||
echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
@@ -713,7 +713,7 @@ ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
@@ -733,7 +733,7 @@ ARG PG_VERSION
|
||||
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
|
||||
RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
|
||||
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
|
||||
|
||||
@@ -749,7 +749,7 @@ ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
|
||||
echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
|
||||
echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
|
||||
wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
|
||||
patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
|
||||
@@ -771,7 +771,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
|
||||
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
|
||||
@@ -787,7 +787,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
|
||||
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
|
||||
@@ -804,7 +804,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
|
||||
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
|
||||
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
|
||||
@@ -928,6 +928,69 @@ RUN rm -r /usr/local/pgsql/include
|
||||
# if they were to be used by other libraries.
|
||||
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer neon-pg-ext-test
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM neon-pg-ext-build AS neon-pg-ext-test
|
||||
ARG PG_VERSION
|
||||
RUN mkdir /ext-src
|
||||
|
||||
#COPY --from=postgis-build /postgis.tar.gz /ext-src/
|
||||
#COPY --from=postgis-build /sfcgal/* /usr
|
||||
COPY --from=plv8-build /plv8.tar.gz /ext-src/
|
||||
COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
|
||||
COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.patch /ext-src/
|
||||
COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
|
||||
#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
|
||||
#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
|
||||
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
|
||||
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
|
||||
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
|
||||
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
|
||||
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
|
||||
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
|
||||
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
|
||||
COPY --from=hll-pg-build /hll.tar.gz /ext-src
|
||||
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||
COPY patches/pg_hintplan.patch /ext-src
|
||||
#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
|
||||
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
|
||||
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
|
||||
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
|
||||
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
|
||||
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
|
||||
COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
|
||||
COPY patches/pg_anon.patch /ext-src
|
||||
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
|
||||
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
|
||||
RUN cd /ext-src/ && for f in *.tar.gz; \
|
||||
do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
|
||||
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|
||||
|| exit 1; rm -f $f; done
|
||||
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
||||
# cmake is required for the h3 test
|
||||
RUN apt-get update && apt-get install -y cmake
|
||||
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN patch -p1 </ext-src/pg_anon.patch
|
||||
RUN patch -p1 </ext-src/pg_cron.patch
|
||||
ENV PATH=/usr/local/pgsql/bin:$PATH
|
||||
ENV PGHOST=compute
|
||||
ENV PGPORT=55433
|
||||
ENV PGUSER=cloud_admin
|
||||
ENV PGDATABASE=postgres
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
ALTER ROLE neon_superuser BYPASSRLS;
|
||||
18
compute_tools/src/migrations/0001-alter_roles.sql
Normal file
18
compute_tools/src/migrations/0001-alter_roles.sql
Normal file
@@ -0,0 +1,18 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
|
||||
END LOOP;
|
||||
|
||||
FOR role_name IN SELECT rolname FROM pg_roles
|
||||
WHERE
|
||||
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
|
||||
END LOOP;
|
||||
END $$;
|
||||
@@ -0,0 +1,6 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -0,0 +1 @@
|
||||
GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;
|
||||
@@ -0,0 +1,4 @@
|
||||
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
|
||||
-- interacted with by neon_superuser without permission issues.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser;
|
||||
@@ -0,0 +1,4 @@
|
||||
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
|
||||
-- interacted with by neon_superuser without permission issues.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser;
|
||||
@@ -0,0 +1,3 @@
|
||||
-- SKIP: Moved inline to the handle_grants() functions.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
|
||||
@@ -0,0 +1,3 @@
|
||||
-- SKIP: Moved inline to the handle_grants() functions.
|
||||
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
|
||||
@@ -0,0 +1,13 @@
|
||||
-- SKIP: The original goal of this migration was to prevent creating
|
||||
-- subscriptions, but this migration was insufficient.
|
||||
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name TEXT;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||
END LOOP;
|
||||
END $$;
|
||||
@@ -774,44 +774,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
// !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
|
||||
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
// Add new migrations in numerical order.
|
||||
let migrations = [
|
||||
"ALTER ROLE neon_superuser BYPASSRLS",
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
|
||||
END LOOP;
|
||||
|
||||
FOR role_name IN SELECT rolname FROM pg_roles
|
||||
WHERE
|
||||
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
|
||||
END LOOP;
|
||||
END $$;
|
||||
"#,
|
||||
r#"
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
|
||||
// Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
|
||||
include_str!("./migrations/0001-alter_roles.sql"),
|
||||
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
|
||||
include_str!(
|
||||
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!(
|
||||
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
|
||||
];
|
||||
|
||||
let mut func = || {
|
||||
@@ -847,10 +824,13 @@ $$;"#,
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
let migration = &migrations[current_migration];
|
||||
if migration.is_empty() {
|
||||
info!("Skip migration id={}", current_migration);
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", current_migration);
|
||||
} else {
|
||||
info!("Running migration:\n{}\n", migration);
|
||||
info!(
|
||||
"Running migration id={}:\n{}\n",
|
||||
current_migration, migration
|
||||
);
|
||||
client.simple_query(migration).with_context(|| {
|
||||
format!("handle_migrations current_migration={}", current_migration)
|
||||
})?;
|
||||
|
||||
@@ -9,6 +9,7 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
|
||||
@@ -7,8 +7,9 @@ use pageserver_api::{
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
@@ -125,6 +126,28 @@ enum Command {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
|
||||
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
|
||||
TenantDrop {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
unclean: bool,
|
||||
},
|
||||
NodeDrop {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
unclean: bool,
|
||||
},
|
||||
TenantSetTimeBasedEviction {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
period: humantime::Duration,
|
||||
#[arg(long)]
|
||||
threshold: humantime::Duration,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -674,6 +697,46 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
Command::TenantDrop { tenant_id, unclean } => {
|
||||
if !unclean {
|
||||
anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.")
|
||||
}
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::POST,
|
||||
format!("debug/v1/tenant/{tenant_id}/drop"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeDrop { node_id, unclean } => {
|
||||
if !unclean {
|
||||
anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.")
|
||||
}
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantSetTimeBasedEviction {
|
||||
tenant_id,
|
||||
period,
|
||||
threshold,
|
||||
} => {
|
||||
vps_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
config: TenantConfig {
|
||||
eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
|
||||
EvictionPolicyLayerAccessThreshold {
|
||||
period: period.into(),
|
||||
threshold: threshold.into(),
|
||||
},
|
||||
)),
|
||||
..Default::default()
|
||||
},
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -8,6 +8,11 @@ USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
python3-pip \
|
||||
netcat
|
||||
#Faker is required for the pg_anon test
|
||||
RUN pip3 install Faker
|
||||
#This is required for the pg_hintplan test
|
||||
RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src
|
||||
|
||||
USER postgres
|
||||
USER postgres
|
||||
@@ -95,7 +95,7 @@
|
||||
},
|
||||
{
|
||||
"name": "shared_preload_libraries",
|
||||
"value": "neon",
|
||||
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
@@ -127,6 +127,16 @@
|
||||
"name": "max_replication_flush_lag",
|
||||
"value": "10GB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "cron.database",
|
||||
"value": "postgres",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "session_preload_libraries",
|
||||
"value": "anon",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
minio:
|
||||
restart: always
|
||||
@@ -194,3 +192,13 @@ services:
|
||||
done"
|
||||
depends_on:
|
||||
- compute
|
||||
|
||||
neon-test-extensions:
|
||||
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
|
||||
entrypoint:
|
||||
- "/bin/bash"
|
||||
- "-c"
|
||||
command:
|
||||
- sleep 1800
|
||||
depends_on:
|
||||
- compute
|
||||
|
||||
@@ -7,15 +7,21 @@
|
||||
# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
|
||||
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
|
||||
# to verify custom image builds (e.g pre-published ones).
|
||||
|
||||
#
|
||||
# A test script for postgres extensions
|
||||
# Currently supports only v16
|
||||
#
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
|
||||
|
||||
COMPOSE_FILE='docker-compose.yml'
|
||||
cd $(dirname $0)
|
||||
docker compose -f $COMPOSE_FILE
|
||||
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
|
||||
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
|
||||
TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
|
||||
: ${http_proxy:=}
|
||||
: ${https_proxy:=}
|
||||
export http_proxy https_proxy
|
||||
|
||||
cleanup() {
|
||||
echo "show container information"
|
||||
@@ -25,34 +31,71 @@ cleanup() {
|
||||
docker compose -f $COMPOSE_FILE down
|
||||
}
|
||||
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
|
||||
for pg_version in 14 15 16; do
|
||||
echo "start containers (pg_version=$pg_version)."
|
||||
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
|
||||
PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose -f $COMPOSE_FILE up --build -d
|
||||
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
cnt=0
|
||||
while sleep 1; do
|
||||
while sleep 3; do
|
||||
# check timeout
|
||||
cnt=`expr $cnt + 1`
|
||||
cnt=`expr $cnt + 3`
|
||||
if [ $cnt -gt 60 ]; then
|
||||
echo "timeout before the compute is ready."
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# check if the compute is ready
|
||||
set +o pipefail
|
||||
result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
|
||||
set -o pipefail
|
||||
if [ $result -eq 1 ]; then
|
||||
if docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
|
||||
echo "OK. The compute is ready to connect."
|
||||
echo "execute simple queries."
|
||||
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
|
||||
cleanup
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ $pg_version -ge 16 ]
|
||||
then
|
||||
echo Enabling trust connection
|
||||
docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
|
||||
echo Adding postgres role
|
||||
docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
|
||||
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config
|
||||
docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# This block is required for the pg_anon extension test.
|
||||
# The test assumes that it is running on the same host with the postgres engine.
|
||||
# In our case it's not true, that's why we are copying files to the compute node
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
|
||||
echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
|
||||
rm -rf $TMPDIR
|
||||
TMPDIR=$(mktemp -d)
|
||||
# The following block does the same for the pg_hintplan test
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
|
||||
rm -rf $TMPDIR
|
||||
# We are running tests now
|
||||
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
|
||||
then
|
||||
cleanup
|
||||
else
|
||||
FAILED=$(tail -1 testout.txt)
|
||||
for d in $FAILED
|
||||
do
|
||||
mkdir $d
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
|
||||
cat $d/regression.out $d/regression.diffs || true
|
||||
done
|
||||
rm -rf $FAILED
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
cleanup
|
||||
done
|
||||
|
||||
15
docker-compose/run-tests.sh
Normal file
15
docker-compose/run-tests.sh
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
set -x
|
||||
|
||||
cd /ext-src
|
||||
FAILED=
|
||||
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d ${d} ] || continue
|
||||
psql -c "select 1" >/dev/null || break
|
||||
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
|
||||
done
|
||||
[ -z "${FAILED}" ] && exit 0
|
||||
echo ${FAILED}
|
||||
exit 1
|
||||
@@ -209,6 +209,7 @@ pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
Pause,
|
||||
PauseForRestart,
|
||||
Draining,
|
||||
}
|
||||
|
||||
@@ -220,6 +221,7 @@ impl FromStr for NodeSchedulingPolicy {
|
||||
"active" => Ok(Self::Active),
|
||||
"filling" => Ok(Self::Filling),
|
||||
"pause" => Ok(Self::Pause),
|
||||
"pause_for_restart" => Ok(Self::PauseForRestart),
|
||||
"draining" => Ok(Self::Draining),
|
||||
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
}
|
||||
@@ -233,6 +235,7 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
Active => "active",
|
||||
Filling => "filling",
|
||||
Pause => "pause",
|
||||
PauseForRestart => "pause_for_restart",
|
||||
Draining => "draining",
|
||||
}
|
||||
.to_string()
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::RepOriginId;
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{fmt, ops::Range};
|
||||
@@ -38,6 +39,9 @@ pub const RELATION_SIZE_PREFIX: u8 = 0x61;
|
||||
/// The key prefix of AUX file keys.
|
||||
pub const AUX_KEY_PREFIX: u8 = 0x62;
|
||||
|
||||
/// The key prefix of ReplOrigin keys.
|
||||
pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;
|
||||
|
||||
/// Check if the key falls in the range of metadata keys.
|
||||
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
|
||||
key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
|
||||
@@ -385,9 +389,11 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_size_key(key: &Key) -> bool {
|
||||
key.field1 == 0 && key.field6 == u32::MAX
|
||||
impl Key {
|
||||
#[inline(always)]
|
||||
pub fn is_rel_size_key(&self) -> bool {
|
||||
self.field1 == 0 && self.field6 == u32::MAX
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -478,12 +484,14 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_slru_segment_size_key(key: &Key) -> bool {
|
||||
key.field1 == 0x01
|
||||
&& key.field2 < 0x03
|
||||
&& key.field3 == 0x01
|
||||
&& key.field5 == 0
|
||||
&& key.field6 == u32::MAX
|
||||
impl Key {
|
||||
pub fn is_slru_segment_size_key(&self) -> bool {
|
||||
self.field1 == 0x01
|
||||
&& self.field2 < 0x03
|
||||
&& self.field3 == 0x01
|
||||
&& self.field5 == 0
|
||||
&& self.field6 == u32::MAX
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -583,6 +591,37 @@ pub const AUX_FILES_KEY: Key = Key {
|
||||
field6: 2,
|
||||
};
|
||||
|
||||
#[inline(always)]
|
||||
pub fn repl_origin_key(origin_id: RepOriginId) -> Key {
|
||||
Key {
|
||||
field1: REPL_ORIGIN_KEY_PREFIX,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: origin_id as u32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the range of replorigin keys.
|
||||
pub fn repl_origin_key_range() -> Range<Key> {
|
||||
Key {
|
||||
field1: REPL_ORIGIN_KEY_PREFIX,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: REPL_ORIGIN_KEY_PREFIX,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0x10000,
|
||||
}
|
||||
}
|
||||
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
@@ -591,73 +630,78 @@ pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
|
||||
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
|
||||
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
|
||||
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(key: Key) -> bool {
|
||||
!NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
|
||||
}
|
||||
impl Key {
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(self) -> bool {
|
||||
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn is_rel_fsm_block_key(self) -> bool {
|
||||
self.field1 == 0x00
|
||||
&& self.field4 != 0
|
||||
&& self.field5 == FSM_FORKNUM
|
||||
&& self.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_vm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00
|
||||
&& key.field4 != 0
|
||||
&& key.field5 == VISIBILITYMAP_FORKNUM
|
||||
&& key.field6 != 0xffffffff
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn is_rel_vm_block_key(self) -> bool {
|
||||
self.field1 == 0x00
|
||||
&& self.field4 != 0
|
||||
&& self.field5 == VISIBILITYMAP_FORKNUM
|
||||
&& self.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x01 => {
|
||||
let kind = match key.field2 {
|
||||
0x00 => SlruKind::Clog,
|
||||
0x01 => SlruKind::MultiXactMembers,
|
||||
0x02 => SlruKind::MultiXactOffsets,
|
||||
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
|
||||
};
|
||||
let segno = key.field4;
|
||||
let blknum = key.field6;
|
||||
#[inline(always)]
|
||||
pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match self.field1 {
|
||||
0x01 => {
|
||||
let kind = match self.field2 {
|
||||
0x00 => SlruKind::Clog,
|
||||
0x01 => SlruKind::MultiXactMembers,
|
||||
0x02 => SlruKind::MultiXactOffsets,
|
||||
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2),
|
||||
};
|
||||
let segno = self.field4;
|
||||
let blknum = self.field6;
|
||||
|
||||
(kind, segno, blknum)
|
||||
}
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
(kind, segno, blknum)
|
||||
}
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_slru_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x01 // SLRU-related
|
||||
&& key.field3 == 0x00000001 // but not SlruDir
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn is_slru_block_key(self) -> bool {
|
||||
self.field1 == 0x01 // SLRU-related
|
||||
&& self.field3 == 0x00000001 // but not SlruDir
|
||||
&& self.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_block_key(key: &Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn is_rel_block_key(&self) -> bool {
|
||||
self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
spcnode: key.field2,
|
||||
dbnode: key.field3,
|
||||
relnode: key.field4,
|
||||
forknum: key.field5,
|
||||
},
|
||||
key.field6,
|
||||
),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match self.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
spcnode: self.field2,
|
||||
dbnode: self.field3,
|
||||
relnode: self.field4,
|
||||
forknum: self.field5,
|
||||
},
|
||||
self.field6,
|
||||
),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::{ops::RangeInclusive, str::FromStr};
|
||||
|
||||
use crate::{
|
||||
key::{is_rel_block_key, Key},
|
||||
models::ShardParameters,
|
||||
};
|
||||
use crate::{key::Key, models::ShardParameters};
|
||||
use hex::FromHex;
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -672,7 +669,7 @@ fn key_is_shard0(key: &Key) -> bool {
|
||||
// because they must be included in basebackups.
|
||||
let is_initfork = key.field5 == INIT_FORKNUM;
|
||||
|
||||
!is_rel_block_key(key) || is_initfork
|
||||
!key.is_rel_block_key() || is_initfork
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
|
||||
@@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> {
|
||||
.allowlist_type("PageHeaderData")
|
||||
.allowlist_type("DBState")
|
||||
.allowlist_type("RelMapFile")
|
||||
.allowlist_type("RepOriginId")
|
||||
// Because structs are used for serialization, tell bindgen to emit
|
||||
// explicit padding fields.
|
||||
.explicit_padding(true)
|
||||
|
||||
@@ -110,6 +110,7 @@ pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
|
||||
// Export some widely used datatypes that are unlikely to change across Postgres versions
|
||||
pub use v14::bindings::RepOriginId;
|
||||
pub use v14::bindings::{uint32, uint64, Oid};
|
||||
pub use v14::bindings::{BlockNumber, OffsetNumber};
|
||||
pub use v14::bindings::{MultiXactId, TransactionId};
|
||||
|
||||
@@ -102,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
|
||||
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
|
||||
pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
|
||||
pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
|
||||
pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
|
||||
// pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
|
||||
// pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
|
||||
|
||||
@@ -167,6 +167,7 @@ pub const RM_RELMAP_ID: u8 = 7;
|
||||
pub const RM_STANDBY_ID: u8 = 8;
|
||||
pub const RM_HEAP2_ID: u8 = 9;
|
||||
pub const RM_HEAP_ID: u8 = 10;
|
||||
pub const RM_REPLORIGIN_ID: u8 = 19;
|
||||
pub const RM_LOGICALMSG_ID: u8 = 21;
|
||||
|
||||
// from neon_rmgr.h
|
||||
@@ -223,6 +224,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLP_LONG_HEADER: u16 = 0x0002;
|
||||
|
||||
/* From xlog.h */
|
||||
pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
|
||||
pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
|
||||
|
||||
/* From replication/slot.h */
|
||||
pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */
|
||||
+ 64 /* NameData */ + 4*4;
|
||||
@@ -237,6 +242,9 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
|
||||
pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
|
||||
(BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
|
||||
|
||||
/* From origin.c */
|
||||
pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
|
||||
|
||||
// List of subdirectories inside pgdata.
|
||||
// Copied from src/bin/initdb/initdb.c
|
||||
pub const PGDATA_SUBDIRS: [&str; 22] = [
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fmt::Display;
|
||||
use std::io;
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
@@ -29,6 +30,7 @@ use http_types::{StatusCode, Url};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use utils::backoff;
|
||||
|
||||
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
|
||||
use crate::{
|
||||
@@ -451,26 +453,58 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
// TODO batch requests are not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
for path in paths {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
|
||||
let request = blob_client.delete().into_future();
|
||||
|
||||
let res = tokio::time::timeout(self.timeout, request).await;
|
||||
|
||||
match res {
|
||||
Ok(Ok(_response)) => continue,
|
||||
Ok(Err(e)) => {
|
||||
if let Some(http_err) = e.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return Err(e.into());
|
||||
}
|
||||
Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
#[derive(Debug)]
|
||||
enum AzureOrTimeout {
|
||||
AzureError(azure_core::Error),
|
||||
Timeout,
|
||||
Cancel,
|
||||
}
|
||||
}
|
||||
impl Display for AzureOrTimeout {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{self:?}")
|
||||
}
|
||||
}
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 5;
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
|
||||
let request = blob_client.delete().into_future();
|
||||
|
||||
let res = tokio::time::timeout(self.timeout, request).await;
|
||||
|
||||
match res {
|
||||
Ok(Ok(_v)) => Ok(()),
|
||||
Ok(Err(azure_err)) => {
|
||||
if let Some(http_err) = azure_err.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
Err(AzureOrTimeout::AzureError(azure_err))
|
||||
}
|
||||
Err(_elapsed) => Err(AzureOrTimeout::Timeout),
|
||||
}
|
||||
},
|
||||
|err| match err {
|
||||
AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false,
|
||||
AzureOrTimeout::Cancel => true,
|
||||
},
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"deleting remote object",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| AzureOrTimeout::Cancel)
|
||||
.and_then(|x| x)
|
||||
.map_err(|e| match e {
|
||||
AzureOrTimeout::AzureError(err) => anyhow::Error::from(err),
|
||||
AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(),
|
||||
AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(),
|
||||
})?;
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
|
||||
|
||||
@@ -78,6 +78,10 @@ where
|
||||
let e = Err(std::io::Error::from(e));
|
||||
return Poll::Ready(Some(e));
|
||||
}
|
||||
} else {
|
||||
// this would be perfectly valid behaviour for doing a graceful completion on the
|
||||
// download for example, but not one we expect to do right now.
|
||||
tracing::warn!("continuing polling after having cancelled or timeouted");
|
||||
}
|
||||
|
||||
this.inner.poll_next(cx)
|
||||
@@ -89,13 +93,22 @@ where
|
||||
}
|
||||
|
||||
/// Fires only on the first cancel or timeout, not on both.
|
||||
pub(crate) async fn cancel_or_timeout(
|
||||
pub(crate) fn cancel_or_timeout(
|
||||
timeout: Duration,
|
||||
cancel: CancellationToken,
|
||||
) -> TimeoutOrCancel {
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
|
||||
_ = cancel.cancelled() => TimeoutOrCancel::Cancel,
|
||||
) -> impl std::future::Future<Output = TimeoutOrCancel> + 'static {
|
||||
// futures are lazy, they don't do anything before being polled.
|
||||
//
|
||||
// "precalculate" the wanted deadline before returning the future, so that we can use pause
|
||||
// failpoint to trigger a timeout in test.
|
||||
let deadline = tokio::time::Instant::now() + timeout;
|
||||
async move {
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout,
|
||||
_ = cancel.cancelled() => {
|
||||
TimeoutOrCancel::Cancel
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,4 +185,31 @@ mod tests {
|
||||
_ = tokio::time::sleep(Duration::from_secs(121)) => {},
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn notified_but_pollable_after() {
|
||||
let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static(
|
||||
b"hello world",
|
||||
))));
|
||||
let timeout = Duration::from_secs(120);
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
cancel.cancel();
|
||||
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
let next = stream.next().await;
|
||||
let ioe = next.unwrap().unwrap_err();
|
||||
assert!(
|
||||
matches!(
|
||||
ioe.get_ref().unwrap().downcast_ref::<DownloadError>(),
|
||||
Some(&DownloadError::Cancelled)
|
||||
),
|
||||
"{ioe:?}"
|
||||
);
|
||||
|
||||
let next = stream.next().await;
|
||||
let bytes = next.unwrap().unwrap();
|
||||
assert_eq!(&b"hello world"[..], bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,9 @@ use std::{fs, io, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
mod rename_noreplace;
|
||||
pub use rename_noreplace::rename_noreplace;
|
||||
|
||||
pub trait PathExt {
|
||||
/// Returns an error if `self` is not a directory.
|
||||
fn is_empty_dir(&self) -> io::Result<bool>;
|
||||
|
||||
109
libs/utils/src/fs_ext/rename_noreplace.rs
Normal file
109
libs/utils/src/fs_ext/rename_noreplace.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use nix::NixPath;
|
||||
|
||||
/// Rename a file without replacing an existing file.
|
||||
///
|
||||
/// This is a wrapper around platform-specific APIs.
|
||||
pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
|
||||
src: &P1,
|
||||
dst: &P2,
|
||||
) -> nix::Result<()> {
|
||||
{
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
nix::fcntl::renameat2(
|
||||
None,
|
||||
src,
|
||||
None,
|
||||
dst,
|
||||
nix::fcntl::RenameFlags::RENAME_NOREPLACE,
|
||||
)
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let res = src.with_nix_path(|src| {
|
||||
dst.with_nix_path(|dst|
|
||||
// SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np.
|
||||
unsafe {
|
||||
nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL)
|
||||
})
|
||||
})??;
|
||||
nix::errno::Errno::result(res).map(drop)
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
std::compile_error!("OS does not support no-replace renames");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use super::*;
|
||||
|
||||
fn testdir() -> camino_tempfile::Utf8TempDir {
|
||||
match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") {
|
||||
Some(path) => {
|
||||
let path: camino::Utf8PathBuf = path;
|
||||
camino_tempfile::tempdir_in(path).unwrap()
|
||||
}
|
||||
None => camino_tempfile::tempdir().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_absolute_paths() {
|
||||
let testdir = testdir();
|
||||
println!("testdir: {}", testdir.path());
|
||||
|
||||
let src = testdir.path().join("src");
|
||||
let dst = testdir.path().join("dst");
|
||||
|
||||
fs::write(&src, b"").unwrap();
|
||||
fs::write(&dst, b"").unwrap();
|
||||
|
||||
let src = src.canonicalize().unwrap();
|
||||
assert!(src.is_absolute());
|
||||
let dst = dst.canonicalize().unwrap();
|
||||
assert!(dst.is_absolute());
|
||||
|
||||
let result = rename_noreplace(&src, &dst);
|
||||
assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relative_paths() {
|
||||
let testdir = testdir();
|
||||
println!("testdir: {}", testdir.path());
|
||||
|
||||
// this is fine because we run in nextest => process per test
|
||||
std::env::set_current_dir(testdir.path()).unwrap();
|
||||
|
||||
let src = PathBuf::from("src");
|
||||
let dst = PathBuf::from("dst");
|
||||
|
||||
fs::write(&src, b"").unwrap();
|
||||
fs::write(&dst, b"").unwrap();
|
||||
|
||||
let result = rename_noreplace(&src, &dst);
|
||||
assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_works_when_not_exists() {
|
||||
let testdir = testdir();
|
||||
println!("testdir: {}", testdir.path());
|
||||
|
||||
let src = testdir.path().join("src");
|
||||
let dst = testdir.path().join("dst");
|
||||
|
||||
fs::write(&src, b"content").unwrap();
|
||||
|
||||
rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap();
|
||||
assert_eq!(
|
||||
"content",
|
||||
String::from_utf8(std::fs::read(&dst).unwrap()).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -34,6 +34,9 @@ pub enum ApiError {
|
||||
#[error("Timeout")]
|
||||
Timeout(Cow<'static, str>),
|
||||
|
||||
#[error("Request cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -74,6 +77,10 @@ impl ApiError {
|
||||
err.to_string(),
|
||||
StatusCode::REQUEST_TIMEOUT,
|
||||
),
|
||||
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
|
||||
self.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
@@ -133,6 +140,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
|
||||
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
|
||||
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
|
||||
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
|
||||
ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
|
||||
_ => info!("Error processing HTTP request: {api_error:#}"),
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,6 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
|
||||
use utils::lsn::Lsn;
|
||||
use pageserver::tenant::IndexPart;
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
pub(crate) enum IndexPartCmd {
|
||||
@@ -17,20 +12,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
IndexPartCmd::Dump { path } => {
|
||||
let bytes = tokio::fs::read(path).await.context("read file")?;
|
||||
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output<'a> {
|
||||
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
timeline_metadata: &'a TimelineMetadata,
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
layer_metadata: &des.layer_metadata,
|
||||
disk_consistent_lsn: des.get_disk_consistent_lsn(),
|
||||
timeline_metadata: &des.metadata,
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).context("serialize output")?;
|
||||
let output = serde_json::to_string_pretty(&des).context("serialize output")?;
|
||||
println!("{output}");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -72,13 +72,14 @@ impl DescribeKeyCommand {
|
||||
println!("{key:?}");
|
||||
|
||||
macro_rules! kind_query {
|
||||
([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}};
|
||||
($name:ident) => {{
|
||||
let s: &'static str = stringify!($name);
|
||||
let s = s.strip_prefix("is_").unwrap_or(s);
|
||||
let s = s.strip_suffix("_key").unwrap_or(s);
|
||||
|
||||
#[allow(clippy::needless_borrow)]
|
||||
(s, pageserver_api::key::$name(key))
|
||||
(s, key.$name())
|
||||
}};
|
||||
}
|
||||
|
||||
@@ -86,18 +87,15 @@ impl DescribeKeyCommand {
|
||||
// "recognization". I think it accurately represents how strictly we model the Key
|
||||
// right now, but could of course be made less confusing.
|
||||
|
||||
let queries = [
|
||||
("rel_block", pageserver_api::key::is_rel_block_key(&key)),
|
||||
kind_query!(is_rel_vm_block_key),
|
||||
kind_query!(is_rel_fsm_block_key),
|
||||
kind_query!(is_slru_block_key),
|
||||
kind_query!(is_inherited_key),
|
||||
("rel_size", pageserver_api::key::is_rel_size_key(&key)),
|
||||
(
|
||||
"slru_segment_size",
|
||||
pageserver_api::key::is_slru_segment_size_key(&key),
|
||||
),
|
||||
];
|
||||
let queries = kind_query!([
|
||||
is_rel_block_key,
|
||||
is_rel_vm_block_key,
|
||||
is_rel_fsm_block_key,
|
||||
is_slru_block_key,
|
||||
is_inherited_key,
|
||||
is_rel_size_key,
|
||||
is_slru_segment_size_key,
|
||||
]);
|
||||
|
||||
let recognized_kind = "recognized kind";
|
||||
let metadata_key = "metadata key";
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
@@ -187,7 +187,7 @@ async fn main_impl(
|
||||
for r in partitioning.keys.ranges.iter() {
|
||||
let mut i = r.start;
|
||||
while i != r.end {
|
||||
if is_rel_block_key(&i) {
|
||||
if i.is_rel_block_key() {
|
||||
filtered.add_key(i);
|
||||
}
|
||||
i = i.next();
|
||||
@@ -308,9 +308,10 @@ async fn main_impl(
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(is_rel_block_key(&key));
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
|
||||
@@ -178,7 +178,8 @@ impl AuxFileSizeEstimator {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn on_base_backup(&self, new_size: usize) {
|
||||
/// When generating base backup or doing initial logical size calculation
|
||||
pub fn on_initial(&self, new_size: usize) {
|
||||
let mut guard = self.size.lock().unwrap();
|
||||
*guard = Some(new_size as isize);
|
||||
self.report(new_size as isize);
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{key_to_slru_block, Key};
|
||||
use pageserver_api::key::Key;
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
@@ -170,7 +170,7 @@ where
|
||||
}
|
||||
|
||||
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
|
||||
let (kind, segno, _) = key_to_slru_block(*key)?;
|
||||
let (kind, segno, _) = key.to_slru_block()?;
|
||||
|
||||
match kind {
|
||||
SlruKind::Clog => {
|
||||
@@ -362,6 +362,13 @@ where
|
||||
));
|
||||
info!("Replication slot {} restart LSN={}", path, restart_lsn);
|
||||
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
|
||||
} else if path == "pg_logical/replorigin_checkpoint" {
|
||||
// replorigin_checkoint is written only on compute shutdown, so it contains
|
||||
// deteriorated values. So we generate our own version of this file for the particular LSN
|
||||
// based on information about replorigins extracted from transaction commit records.
|
||||
// In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
|
||||
// but now we should handle (skip) it for backward compatibility.
|
||||
continue;
|
||||
}
|
||||
let header = new_tar_header(&path, content.len() as u64)?;
|
||||
self.ar
|
||||
@@ -390,6 +397,32 @@ where
|
||||
{
|
||||
self.add_twophase_file(xid).await?;
|
||||
}
|
||||
let repl_origins = self
|
||||
.timeline
|
||||
.get_replorigins(self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
let n_origins = repl_origins.len();
|
||||
if n_origins != 0 {
|
||||
//
|
||||
// Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins
|
||||
// extracted from transaction commit record. We are using this file to pass information about replication
|
||||
// origins to compute to allow logical replication to restart from proper point.
|
||||
//
|
||||
let mut content = Vec::with_capacity(n_origins * 16 + 8);
|
||||
content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes());
|
||||
for (origin_id, origin_lsn) in repl_origins {
|
||||
content.extend_from_slice(&origin_id.to_le_bytes());
|
||||
content.extend_from_slice(&[0u8; 6]); // align to 8 bytes
|
||||
content.extend_from_slice(&origin_lsn.0.to_le_bytes());
|
||||
}
|
||||
let crc32 = crc32c::crc32c(&content);
|
||||
content.extend_from_slice(&crc32.to_le_bytes());
|
||||
let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
|
||||
self.ar.append(&header, &*content).await.context(
|
||||
"could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
|
||||
)?;
|
||||
}
|
||||
|
||||
fail_point!("basebackup-before-control-file", |_| {
|
||||
Err(BasebackupError::Server(anyhow!(
|
||||
|
||||
@@ -99,8 +99,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -146,8 +144,6 @@ pub mod defaults {
|
||||
|
||||
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||
|
||||
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -300,8 +296,6 @@ pub struct PageServerConf {
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub walredo_process_kind: crate::walredo::ProcessKind,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -407,8 +401,6 @@ struct PageServerConfigBuilder {
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -497,8 +489,6 @@ impl PageServerConfigBuilder {
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
|
||||
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -686,10 +676,6 @@ impl PageServerConfigBuilder {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
|
||||
self.walredo_process_kind = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -747,7 +733,6 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
walredo_process_kind,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -1044,9 +1029,6 @@ impl PageServerConf {
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
"walredo_process_kind" => {
|
||||
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1130,7 +1112,6 @@ impl PageServerConf {
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1370,7 +1351,6 @@ background_task_maximum_delay = '334 s'
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1444,7 +1424,6 @@ background_task_maximum_delay = '334 s'
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -81,8 +81,10 @@ paths:
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
|
||||
404 means that deletion successfully finished"
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant was successfully deleted, or was already not found.
|
||||
"404":
|
||||
description: Tenant not found. This is the success path.
|
||||
description: Tenant not found. This is a success result, equivalent to 200.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
|
||||
@@ -181,9 +181,7 @@ impl From<PageReconstructError> for ApiError {
|
||||
PageReconstructError::MissingKey(e) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
|
||||
}
|
||||
PageReconstructError::Cancelled => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::Cancelled => ApiError::Cancelled,
|
||||
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||
}
|
||||
@@ -1073,7 +1071,7 @@ async fn tenant_delete_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
state
|
||||
let status = state
|
||||
.tenant_manager
|
||||
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
|
||||
.instrument(info_span!("tenant_delete_handler",
|
||||
@@ -1082,7 +1080,14 @@ async fn tenant_delete_handler(
|
||||
))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
// Callers use 404 as success for deletions, for historical reasons.
|
||||
if status == StatusCode::NOT_FOUND {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Deletion complete").into(),
|
||||
));
|
||||
}
|
||||
|
||||
json_response(status, ())
|
||||
}
|
||||
|
||||
/// HTTP endpoint to query the current tenant_size of a tenant.
|
||||
@@ -2182,7 +2187,7 @@ async fn tenant_scan_remote_handler(
|
||||
{
|
||||
Ok((index_part, index_generation)) => {
|
||||
tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
|
||||
index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
|
||||
index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
|
||||
generation = std::cmp::max(generation, index_generation);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
@@ -2424,6 +2429,25 @@ async fn list_aux_files(
|
||||
json_response(StatusCode::OK, files)
|
||||
}
|
||||
|
||||
async fn perf_info(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let result = timeline.perf_info().await;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
async fn ingest_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -2851,5 +2875,9 @@ pub fn make_router(
|
||||
|r| testing_api_handler("list_aux_files", r, list_aux_files),
|
||||
)
|
||||
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
|
||||
|r| testing_api_handler("perf_info", r, perf_info),
|
||||
)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -2108,6 +2108,7 @@ pub(crate) struct TimelineMetrics {
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
shutdown: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl TimelineMetrics {
|
||||
@@ -2227,6 +2228,7 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
shutdown: std::sync::atomic::AtomicBool::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2249,6 +2251,17 @@ impl TimelineMetrics {
|
||||
}
|
||||
|
||||
pub(crate) fn shutdown(&self) {
|
||||
let was_shutdown = self
|
||||
.shutdown
|
||||
.swap(true, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
if was_shutdown {
|
||||
// this happens on tenant deletion because tenant first shuts down timelines, then
|
||||
// invokes timeline deletion which first shuts down the timeline again.
|
||||
// TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
|
||||
return;
|
||||
}
|
||||
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
|
||||
@@ -17,8 +17,8 @@ use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
@@ -27,7 +27,7 @@ use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
||||
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
@@ -36,6 +36,7 @@ use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
@@ -409,6 +410,8 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||
pausable_failpoint!("find-lsn-for-timestamp-pausable");
|
||||
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
// We use this method to figure out the branching LSN for the new branch, but the
|
||||
// GC cutoff could be before the branching point and we cannot create a new branch
|
||||
@@ -424,6 +427,7 @@ impl Timeline {
|
||||
|
||||
let mut found_smaller = false;
|
||||
let mut found_larger = false;
|
||||
|
||||
while low < high {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(PageReconstructError::Cancelled);
|
||||
@@ -718,10 +722,22 @@ impl Timeline {
|
||||
result.insert(fname, content);
|
||||
}
|
||||
}
|
||||
self.aux_file_size_estimator.on_base_backup(sz);
|
||||
self.aux_file_size_estimator.on_initial(sz);
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) async fn trigger_aux_file_size_computation(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
|
||||
self.list_aux_files_v2(lsn, ctx).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
@@ -760,6 +776,27 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_replorigins(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
|
||||
.await
|
||||
.context("scan")?;
|
||||
let mut result = HashMap::new();
|
||||
for (k, v) in kv {
|
||||
let v = v.context("get value")?;
|
||||
let origin_id = k.field6 as RepOriginId;
|
||||
let origin_lsn = Lsn::des(&v).unwrap();
|
||||
if origin_lsn != Lsn::INVALID {
|
||||
result.insert(origin_id, origin_lsn);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used to initialize the logical size tracking on startup.
|
||||
///
|
||||
@@ -885,7 +922,9 @@ impl Timeline {
|
||||
Ok((
|
||||
result.to_keyspace(),
|
||||
/* AUX sparse key space */
|
||||
SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
|
||||
SparseKeySpace(KeySpace {
|
||||
ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
|
||||
}),
|
||||
))
|
||||
}
|
||||
|
||||
@@ -1154,6 +1193,20 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn set_replorigin(
|
||||
&mut self,
|
||||
origin_id: RepOriginId,
|
||||
origin_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let key = repl_origin_key(origin_id);
|
||||
self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
|
||||
self.set_replorigin(origin_id, Lsn::INVALID).await
|
||||
}
|
||||
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
self.put(CONTROLFILE_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
@@ -1684,7 +1737,7 @@ impl<'a> DatadirModification<'a> {
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
for (lsn, value) in values {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
if key.is_rel_block_key() || key.is_slru_block_key() {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
|
||||
@@ -3395,6 +3395,12 @@ impl Tenant {
|
||||
let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
|
||||
let unfinished_timeline = raw_timeline.raw_timeline()?;
|
||||
|
||||
// Flush the new layer files to disk, before we make the timeline as available to
|
||||
// the outside world.
|
||||
//
|
||||
// Flush loop needs to be spawned in order to be able to flush.
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
unfinished_timeline,
|
||||
&pgdata_path,
|
||||
@@ -3406,12 +3412,6 @@ impl Tenant {
|
||||
format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
// Flush the new layer files to disk, before we make the timeline as available to
|
||||
// the outside world.
|
||||
//
|
||||
// Flush loop needs to be spawned in order to be able to flush.
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
anyhow::bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
@@ -3865,6 +3865,9 @@ pub(crate) mod harness {
|
||||
pub fn create_custom(
|
||||
test_name: &'static str,
|
||||
tenant_conf: TenantConf,
|
||||
tenant_id: TenantId,
|
||||
shard_identity: ShardIdentity,
|
||||
generation: Generation,
|
||||
) -> anyhow::Result<Self> {
|
||||
setup_logging();
|
||||
|
||||
@@ -3877,8 +3880,12 @@ pub(crate) mod harness {
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let shard = shard_identity.shard_index();
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: shard.shard_number,
|
||||
shard_count: shard.shard_count,
|
||||
};
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
|
||||
fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?;
|
||||
|
||||
@@ -3896,8 +3903,8 @@ pub(crate) mod harness {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_shard_id,
|
||||
generation: Generation::new(0xdeadbeef),
|
||||
shard: ShardIndex::unsharded(),
|
||||
generation,
|
||||
shard,
|
||||
remote_storage,
|
||||
remote_fs_dir,
|
||||
deletion_queue,
|
||||
@@ -3912,8 +3919,15 @@ pub(crate) mod harness {
|
||||
compaction_period: Duration::ZERO,
|
||||
..TenantConf::default()
|
||||
};
|
||||
|
||||
Self::create_custom(test_name, tenant_conf)
|
||||
let tenant_id = TenantId::generate();
|
||||
let shard = ShardIdentity::unsharded();
|
||||
Self::create_custom(
|
||||
test_name,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
shard,
|
||||
Generation::new(0xdeadbeef),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn span(&self) -> tracing::Span {
|
||||
@@ -3992,8 +4006,8 @@ pub(crate) mod harness {
|
||||
let base_img = base_img.expect("Neon WAL redo requires base image").1;
|
||||
let mut page = BytesMut::new();
|
||||
page.extend_from_slice(&base_img);
|
||||
for (_record_lsn, record) in records {
|
||||
apply_neon::apply_in_neon(&record, key, &mut page)?;
|
||||
for (record_lsn, record) in records {
|
||||
apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
|
||||
}
|
||||
Ok(page.freeze())
|
||||
} else {
|
||||
@@ -4030,13 +4044,16 @@ mod tests {
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hex_literal::hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use rand::{thread_rng, Rng};
|
||||
use storage_layer::PersistentLayerKey;
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::TenantId;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
@@ -4936,7 +4953,13 @@ mod tests {
|
||||
..TenantConf::default()
|
||||
};
|
||||
|
||||
let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?;
|
||||
let harness = TenantHarness::create_custom(
|
||||
"test_get_vectored_key_gap",
|
||||
tenant_conf,
|
||||
TenantId::generate(),
|
||||
ShardIdentity::unsharded(),
|
||||
Generation::new(0xdeadbeef),
|
||||
)?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
@@ -6469,4 +6492,369 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_vectored_impl_wrapper(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Bytes>, GetVectoredError> {
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let mut res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key..key.next()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
Ok(res.pop_last().map(|(k, v)| {
|
||||
assert_eq!(k, key);
|
||||
v.unwrap()
|
||||
}))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
|
||||
|
||||
// We emulate the situation that the compaction algorithm creates an image layer that removes the tombstones
|
||||
// Lsn 0x30 key0, key3, no key1+key2
|
||||
// Lsn 0x20 key1+key2 tomestones
|
||||
// Lsn 0x10 key1 in image, key2 in delta
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
// delta layers
|
||||
vec![
|
||||
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
|
||||
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
],
|
||||
// image layers
|
||||
vec![
|
||||
(Lsn(0x10), vec![(key1, test_img("metadata key 1"))]),
|
||||
(
|
||||
Lsn(0x30),
|
||||
vec![
|
||||
(key0, test_img("metadata key 0")),
|
||||
(key3, test_img("metadata key 3")),
|
||||
],
|
||||
),
|
||||
],
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x30);
|
||||
let old_lsn = Lsn(0x20);
|
||||
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key 0"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?,
|
||||
None,
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?,
|
||||
None,
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?,
|
||||
Some(Bytes::new()),
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?,
|
||||
Some(Bytes::new()),
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key 3"))
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_image_creation() {
|
||||
let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
// delta layers
|
||||
vec![
|
||||
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
|
||||
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
vec![
|
||||
(key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
|
||||
(key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
|
||||
],
|
||||
],
|
||||
// image layers
|
||||
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceImageLayerCreation);
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.filter(|(k, _)| k.is_metadata_key())
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_tombstone_empty_image_creation() {
|
||||
let harness =
|
||||
TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
// delta layers
|
||||
vec![
|
||||
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
|
||||
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
|
||||
],
|
||||
// image layers
|
||||
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
|
||||
Lsn(0x30),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceImageLayerCreation);
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.filter(|(k, _)| k.is_metadata_key())
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
// We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
|
||||
//
|
||||
// | D1 | | D3 |
|
||||
// -| |-- gc horizon -----------------
|
||||
// | | | D2 |
|
||||
// --------- img layer ------------------
|
||||
//
|
||||
// What we should expact from this compaction is:
|
||||
// | Part of D1 | | D3 |
|
||||
// --------- img layer with D1+D2 at GC horizon------------------
|
||||
|
||||
// img layer at 0x10
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), test_img(&format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
// TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose.
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 1@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::Image(test_img("value 2@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 3@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 5@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::Image(test_img("value 6@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 8@0x40")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x40),
|
||||
Value::Image(test_img("value 9@0x40")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![delta1, delta2, delta3], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.pitr = Lsn(0x30);
|
||||
guard.cutoffs.horizon = Lsn(0x30);
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
|
||||
// Check if the image layer at the GC horizon contains exactly what we want
|
||||
let image_at_gc_horizon = tline
|
||||
.inspect_image_layers(Lsn(0x30), &ctx)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.filter(|(k, _)| k.is_metadata_key())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(image_at_gc_horizon.len(), 10);
|
||||
let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10];
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
image_at_gc_horizon[idx],
|
||||
(
|
||||
get_key(idx as u32),
|
||||
test_img(&format!("value {idx}@{:#x}", expected_lsn[idx]))
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Check if old layers are removed / new layers have the expected LSN
|
||||
let mut all_layers = tline.inspect_historic_layers().await.unwrap();
|
||||
all_layers.sort_by(|k1, k2| {
|
||||
(
|
||||
k1.is_delta,
|
||||
k1.key_range.start,
|
||||
k1.key_range.end,
|
||||
k1.lsn_range.start,
|
||||
k1.lsn_range.end,
|
||||
)
|
||||
.cmp(&(
|
||||
k2.is_delta,
|
||||
k2.key_range.start,
|
||||
k2.key_range.end,
|
||||
k2.lsn_range.start,
|
||||
k2.lsn_range.end,
|
||||
))
|
||||
});
|
||||
assert_eq!(
|
||||
all_layers,
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..get_key(10),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
// The delta layer that is cut in the middle
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..get_key(9),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x41),
|
||||
is_delta: true
|
||||
},
|
||||
// The delta layer we created and should not be picked for the compaction
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x40)..Lsn(0x41),
|
||||
is_delta: true
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ use crate::{
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
timeline::ShutdownMode,
|
||||
},
|
||||
};
|
||||
@@ -531,6 +532,25 @@ impl DeleteTenantFlow {
|
||||
}
|
||||
}
|
||||
|
||||
// Remove top-level tenant objects that don't belong to a timeline, such as heatmap
|
||||
let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
|
||||
if let Some(Err(e)) = backoff::retry(
|
||||
|| async {
|
||||
remote_storage
|
||||
.delete(&heatmap_path, &task_mgr::shutdown_token())
|
||||
.await
|
||||
},
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_remote_tenant_heatmap",
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
|
||||
}
|
||||
|
||||
let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
|
||||
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
|
||||
if timelines_path.exists() {
|
||||
|
||||
@@ -1,15 +1,23 @@
|
||||
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
|
||||
//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
|
||||
//! this struct and it's original serialization format is still needed because they were written a
|
||||
//! long time ago.
|
||||
//!
|
||||
//! Later, the file gets used in [`remote_timeline_client`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
|
||||
//! versioning.
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
//! To clean up this module we need to migrate all index_part.json files to a later version.
|
||||
//! While doing this, we need to be mindful about s3 based recovery as well, so it might take
|
||||
//! however long we keep the old versions to be able to delete the old code. After that, we can
|
||||
//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and
|
||||
//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards
|
||||
//! compatibility.
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart
|
||||
|
||||
use anyhow::ensure;
|
||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
|
||||
|
||||
@@ -17,17 +25,37 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
|
||||
const METADATA_FORMAT_VERSION: u16 = 4;
|
||||
|
||||
/// Previous supported format versions.
|
||||
///
|
||||
/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming
|
||||
/// that requires a scrubber run which is yet to be done.
|
||||
const METADATA_OLD_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
|
||||
/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic.
|
||||
///
|
||||
/// This is the same assumption that PostgreSQL makes with the control file,
|
||||
///
|
||||
/// see PG_CONTROL_MAX_SAFE_SIZE
|
||||
const METADATA_MAX_SIZE: usize = 512;
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
/// Legacy metadata stored as a component of `index_part.json` per timeline.
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in Timeline.
|
||||
/// Do not make new changes to this type or the module. In production, we have two different kinds
|
||||
/// of serializations of this type: bincode and json. Bincode version reflects what used to be
|
||||
/// stored on disk in earlier versions and does internal crc32 checksumming.
|
||||
///
|
||||
/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would
|
||||
/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern
|
||||
/// as-exists in `index_part.json` ([`self::modern_serde`]).
|
||||
///
|
||||
/// ```compile_fail
|
||||
/// #[derive(serde::Serialize)]
|
||||
/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata);
|
||||
/// ```
|
||||
///
|
||||
/// ```compile_fail
|
||||
/// #[derive(serde::Deserialize)]
|
||||
/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata);
|
||||
/// ```
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TimelineMetadata {
|
||||
hdr: TimelineMetadataHeader,
|
||||
@@ -40,6 +68,49 @@ struct TimelineMetadataHeader {
|
||||
size: u16, // size of serialized metadata
|
||||
format_version: u16, // metadata format version (used for compatibility checks)
|
||||
}
|
||||
|
||||
impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
|
||||
type Error = Crc32CalculationFailed;
|
||||
|
||||
fn try_from(value: &TimelineMetadataBodyV2) -> Result<Self, Self::Error> {
|
||||
#[derive(Default)]
|
||||
struct Crc32Sink {
|
||||
crc: u32,
|
||||
count: usize,
|
||||
}
|
||||
|
||||
impl std::io::Write for Crc32Sink {
|
||||
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
||||
self.crc = crc32c::crc32c_append(self.crc, buf);
|
||||
self.count += buf.len();
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
|
||||
// across serialization versions
|
||||
let mut sink = Crc32Sink::default();
|
||||
<TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(value, &mut sink)
|
||||
.map_err(Crc32CalculationFailed)?;
|
||||
|
||||
let size = METADATA_HDR_SIZE + sink.count;
|
||||
|
||||
Ok(TimelineMetadataHeader {
|
||||
checksum: sink.crc,
|
||||
size: size as u16,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("re-serializing for crc32 failed")]
|
||||
struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
|
||||
|
||||
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -111,6 +182,12 @@ impl TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result<Self> {
|
||||
self.hdr = TimelineMetadataHeader::try_from(&self.body)?;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
|
||||
|
||||
@@ -261,25 +338,93 @@ impl TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TimelineMetadata {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let bytes = Vec::<u8>::deserialize(deserializer)?;
|
||||
Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
|
||||
}
|
||||
}
|
||||
pub(crate) mod modern_serde {
|
||||
use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
impl Serialize for TimelineMetadata {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let bytes = self
|
||||
.to_bytes()
|
||||
.map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
|
||||
bytes.serialize(serializer)
|
||||
// for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec<u8> field with
|
||||
// BeSer.
|
||||
struct Visitor;
|
||||
|
||||
impl<'d> serde::de::Visitor<'d> for Visitor {
|
||||
type Value = TimelineMetadata;
|
||||
|
||||
fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
f.write_str("BeSer bytes or json structure")
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'d>,
|
||||
{
|
||||
use serde::de::Error;
|
||||
let de = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
Vec::<u8>::deserialize(de)
|
||||
.map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))?
|
||||
}
|
||||
|
||||
fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::MapAccess<'d>,
|
||||
{
|
||||
use serde::de::Error;
|
||||
|
||||
let de = serde::de::value::MapAccessDeserializer::new(map);
|
||||
let body = TimelineMetadataBodyV2::deserialize(de)?;
|
||||
let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?;
|
||||
|
||||
Ok(TimelineMetadata { hdr, body })
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_any(Visitor)
|
||||
}
|
||||
|
||||
pub(crate) fn serialize<S>(
|
||||
metadata: &TimelineMetadata,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
// header is not needed, upon reading we've upgraded all v1 to v2
|
||||
metadata.body.serialize(serializer)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deserializes_bytes_as_well_as_equivalent_body_v2() {
|
||||
#[derive(serde::Deserialize, serde::Serialize)]
|
||||
struct Wrapper(
|
||||
#[serde(deserialize_with = "deserialize", serialize_with = "serialize")]
|
||||
TimelineMetadata,
|
||||
);
|
||||
|
||||
let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
|
||||
|
||||
let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
|
||||
|
||||
let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
serialized,
|
||||
serde_json::json! {{
|
||||
"disk_consistent_lsn": "0/149FD90",
|
||||
"prev_record_lsn": "0/149FD18",
|
||||
"ancestor_timeline": null,
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/149FD18",
|
||||
"initdb_lsn": "0/149FD18",
|
||||
"pg_version": 15
|
||||
}}
|
||||
);
|
||||
|
||||
let wrapper_from_json = serde_json::value::from_value::<Wrapper>(serialized).unwrap();
|
||||
|
||||
assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -403,59 +548,6 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_metadata_bincode_serde() {
|
||||
let original_metadata = TimelineMetadata::new(
|
||||
Lsn(0x200),
|
||||
Some(Lsn(0x100)),
|
||||
Some(TIMELINE_ID),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
);
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
.expect("Cannot create bytes array from metadata");
|
||||
|
||||
let metadata_bincode_be_bytes = original_metadata
|
||||
.ser()
|
||||
.expect("Cannot serialize the metadata");
|
||||
|
||||
// 8 bytes for the length of the vector
|
||||
assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
|
||||
|
||||
let expected_bincode_bytes = {
|
||||
let mut temp = vec![];
|
||||
let len_bytes = metadata_bytes.len().to_be_bytes();
|
||||
temp.extend_from_slice(&len_bytes);
|
||||
temp.extend_from_slice(&metadata_bytes);
|
||||
temp
|
||||
};
|
||||
assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
|
||||
|
||||
let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
|
||||
// Deserialized metadata has the metadata header, which is different from the serialized one.
|
||||
// Reference: TimelineMetaData::to_bytes()
|
||||
let expected_metadata = {
|
||||
let mut temp_metadata = original_metadata;
|
||||
let body_bytes = temp_metadata
|
||||
.body
|
||||
.ser()
|
||||
.expect("Cannot serialize the metadata body");
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
temp_metadata.hdr = hdr;
|
||||
temp_metadata
|
||||
};
|
||||
assert_eq!(deserialized_metadata, expected_metadata);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_metadata_bincode_serde_ensure_roundtrip() {
|
||||
let original_metadata = TimelineMetadata::new(
|
||||
@@ -469,8 +561,6 @@ mod tests {
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* bincode length encoding bytes */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
|
||||
/* TimelineMetadataHeader */
|
||||
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
/* TimelineMetadataBodyV2 */
|
||||
@@ -500,7 +590,7 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let metadata_ser_bytes = original_metadata.ser().unwrap();
|
||||
let metadata_ser_bytes = original_metadata.to_bytes().unwrap();
|
||||
assert_eq!(metadata_ser_bytes, expected_bytes);
|
||||
|
||||
let expected_metadata = {
|
||||
@@ -518,7 +608,7 @@ mod tests {
|
||||
temp_metadata.hdr = hdr;
|
||||
temp_metadata
|
||||
};
|
||||
let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
|
||||
let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap();
|
||||
assert_eq!(des_metadata, expected_metadata);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use hyper::StatusCode;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
@@ -54,6 +55,7 @@ use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
use super::remote_timeline_client::remote_tenant_path;
|
||||
use super::secondary::SecondaryTenant;
|
||||
use super::timeline::detach_ancestor::PreparedTimelineDetach;
|
||||
use super::TenantSharedResources;
|
||||
@@ -1369,7 +1371,7 @@ impl TenantManager {
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
activation_timeout: Duration,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
) -> Result<StatusCode, DeleteTenantError> {
|
||||
super::span::debug_assert_current_span_has_tenant_id();
|
||||
// We acquire a SlotGuard during this function to protect against concurrent
|
||||
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
|
||||
@@ -1382,18 +1384,79 @@ impl TenantManager {
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080
|
||||
|
||||
let slot_guard =
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
|
||||
// Tenant deletion can happen two ways:
|
||||
// - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
|
||||
// state until deletion is complete.
|
||||
// - New: called on a pageserver without an attached location. We proceed with deletion from
|
||||
// remote storage.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
|
||||
|
||||
// unwrap is safe because we used MustExist mode when acquiring
|
||||
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
|
||||
TenantSlot::Attached(tenant) => tenant.clone(),
|
||||
_ => {
|
||||
// Express "not attached" as equivalent to "not found"
|
||||
return Err(DeleteTenantError::NotAttached);
|
||||
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
match &slot_guard.old_value {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
// Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
|
||||
// deletion will be resumed across restarts.
|
||||
let tenant = tenant.clone();
|
||||
return self
|
||||
.delete_tenant_attached(slot_guard, tenant, activation_timeout)
|
||||
.await;
|
||||
}
|
||||
Some(TenantSlot::Secondary(secondary_tenant)) => {
|
||||
secondary_tenant.shutdown().await;
|
||||
let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
|
||||
let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} rename")
|
||||
})?;
|
||||
spawn_background_purge(tmp_dir);
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => unreachable!(),
|
||||
None => {}
|
||||
};
|
||||
|
||||
// Fall through: local state for this tenant is no longer present, proceed with remote delete
|
||||
let remote_path = remote_tenant_path(&tenant_shard_id);
|
||||
let keys = match self
|
||||
.resources
|
||||
.remote_storage
|
||||
.list(
|
||||
Some(&remote_path),
|
||||
remote_storage::ListingMode::NoDelimiter,
|
||||
None,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(listing) => listing.keys,
|
||||
Err(remote_storage::DownloadError::Cancelled) => {
|
||||
return Err(DeleteTenantError::Cancelled)
|
||||
}
|
||||
Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
|
||||
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
|
||||
};
|
||||
|
||||
if keys.is_empty() {
|
||||
tracing::info!("Remote storage already deleted");
|
||||
} else {
|
||||
tracing::info!("Deleting {} keys from remote storage", keys.len());
|
||||
self.resources
|
||||
.remote_storage
|
||||
.delete_objects(&keys, &self.cancel)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Callers use 404 as success for deletions, for historical reasons.
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
}
|
||||
|
||||
async fn delete_tenant_attached(
|
||||
&self,
|
||||
slot_guard: SlotGuard,
|
||||
tenant: Arc<Tenant>,
|
||||
activation_timeout: Duration,
|
||||
) -> Result<StatusCode, DeleteTenantError> {
|
||||
match tenant.current_state() {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// If deletion is already in progress, return success (the semantics of this
|
||||
@@ -1403,7 +1466,7 @@ impl TenantManager {
|
||||
// The `delete_progress` lock is held: deletion is already happening
|
||||
// in the bacckground
|
||||
slot_guard.revert();
|
||||
return Ok(());
|
||||
return Ok(StatusCode::ACCEPTED);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
@@ -1436,7 +1499,8 @@ impl TenantManager {
|
||||
|
||||
// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
|
||||
slot_guard.revert();
|
||||
result
|
||||
let () = result?;
|
||||
Ok(StatusCode::ACCEPTED)
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
|
||||
@@ -91,8 +91,7 @@
|
||||
//!
|
||||
//! The *actual* remote state lags behind the *desired* remote state while
|
||||
//! there are in-flight operations.
|
||||
//! We keep track of the desired remote state in
|
||||
//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`].
|
||||
//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`].
|
||||
//! It is initialized based on the [`IndexPart`] that was passed during init
|
||||
//! and updated with every `schedule_*` function call.
|
||||
//! All this is necessary necessary to compute the future [`IndexPart`]s
|
||||
@@ -115,8 +114,7 @@
|
||||
//!
|
||||
//! # Completion
|
||||
//!
|
||||
//! Once an operation has completed, we update
|
||||
//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
|
||||
//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately,
|
||||
//! and submit a request through the DeletionQueue to update
|
||||
//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
|
||||
//! validated that our generation is not stale. It is this visible value
|
||||
@@ -416,6 +414,7 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise.
|
||||
pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
|
||||
match &mut *self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
@@ -442,13 +441,11 @@ impl RemoteTimelineClient {
|
||||
/// Returns true if this timeline was previously detached at this Lsn and the remote timeline
|
||||
/// client is currently initialized.
|
||||
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
|
||||
// technically this is a dirty read, but given how timeline detach ancestor is implemented
|
||||
// via tenant restart, the lineage has always been uploaded.
|
||||
self.upload_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.initialized_mut()
|
||||
.map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
|
||||
.map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
@@ -457,7 +454,6 @@ impl RemoteTimelineClient {
|
||||
current_remote_index_part
|
||||
.layer_metadata
|
||||
.values()
|
||||
// If we don't have the file size for the layer, don't account for it in the metric.
|
||||
.map(|ilmd| ilmd.file_size)
|
||||
.sum()
|
||||
} else {
|
||||
@@ -585,9 +581,9 @@ impl RemoteTimelineClient {
|
||||
|
||||
// As documented in the struct definition, it's ok for latest_metadata to be
|
||||
// ahead of what's _actually_ on the remote during index upload.
|
||||
upload_queue.latest_metadata = metadata.clone();
|
||||
upload_queue.dirty.metadata = metadata.clone();
|
||||
|
||||
self.schedule_index_upload(upload_queue);
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -606,9 +602,9 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
upload_queue.latest_metadata.apply(update);
|
||||
upload_queue.dirty.metadata.apply(update);
|
||||
|
||||
self.schedule_index_upload(upload_queue);
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -620,8 +616,8 @@ impl RemoteTimelineClient {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.last_aux_file_policy = last_aux_file_policy;
|
||||
self.schedule_index_upload(upload_queue);
|
||||
upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Ok(())
|
||||
}
|
||||
///
|
||||
@@ -639,30 +635,44 @@ impl RemoteTimelineClient {
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue);
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background (internal function)
|
||||
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
fn schedule_index_upload(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
) -> anyhow::Result<()> {
|
||||
let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
|
||||
// fix up the duplicated field
|
||||
upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn;
|
||||
|
||||
// make sure it serializes before doing it in perform_upload_task so that it doesn't
|
||||
// look like a retryable error
|
||||
let void = std::io::sink();
|
||||
serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?;
|
||||
|
||||
let index_part = &upload_queue.dirty;
|
||||
|
||||
info!(
|
||||
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
|
||||
upload_queue.latest_files.len(),
|
||||
index_part.layer_metadata.len(),
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
);
|
||||
|
||||
let index_part = IndexPart::from(&*upload_queue);
|
||||
let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
|
||||
let op = UploadOp::UploadMetadata {
|
||||
uploaded: Box::new(index_part.clone()),
|
||||
};
|
||||
self.metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
|
||||
|
||||
// Launch the task immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn schedule_reparenting_and_wait(
|
||||
@@ -675,16 +685,16 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
|
||||
let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else {
|
||||
return Err(anyhow::anyhow!(
|
||||
"cannot reparent without a current ancestor"
|
||||
));
|
||||
};
|
||||
|
||||
upload_queue.latest_metadata.reparent(new_parent);
|
||||
upload_queue.latest_lineage.record_previous_ancestor(&prev);
|
||||
upload_queue.dirty.metadata.reparent(new_parent);
|
||||
upload_queue.dirty.lineage.record_previous_ancestor(&prev);
|
||||
|
||||
self.schedule_index_upload(upload_queue);
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
|
||||
self.schedule_barrier0(upload_queue)
|
||||
};
|
||||
@@ -705,16 +715,17 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
upload_queue.latest_metadata.detach_from_ancestor(&adopted);
|
||||
upload_queue.latest_lineage.record_detaching(&adopted);
|
||||
upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
|
||||
upload_queue.dirty.lineage.record_detaching(&adopted);
|
||||
|
||||
for layer in layers {
|
||||
upload_queue
|
||||
.latest_files
|
||||
.dirty
|
||||
.layer_metadata
|
||||
.insert(layer.layer_desc().layer_name(), layer.metadata());
|
||||
}
|
||||
|
||||
self.schedule_index_upload(upload_queue);
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
|
||||
let barrier = self.schedule_barrier0(upload_queue);
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
@@ -746,7 +757,8 @@ impl RemoteTimelineClient {
|
||||
let metadata = layer.metadata();
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.dirty
|
||||
.layer_metadata
|
||||
.insert(layer.layer_desc().layer_name(), metadata.clone());
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
|
||||
@@ -776,8 +788,8 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
let with_metadata =
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
|
||||
let with_metadata = self
|
||||
.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?;
|
||||
|
||||
self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);
|
||||
|
||||
@@ -801,7 +813,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());
|
||||
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
|
||||
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
|
||||
@@ -814,7 +826,7 @@ impl RemoteTimelineClient {
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
names: I,
|
||||
) -> Vec<(LayerName, LayerFileMetadata)>
|
||||
) -> anyhow::Result<Vec<(LayerName, LayerFileMetadata)>>
|
||||
where
|
||||
I: IntoIterator<Item = LayerName>,
|
||||
{
|
||||
@@ -824,7 +836,7 @@ impl RemoteTimelineClient {
|
||||
let with_metadata: Vec<_> = names
|
||||
.into_iter()
|
||||
.filter_map(|name| {
|
||||
let meta = upload_queue.latest_files.remove(&name);
|
||||
let meta = upload_queue.dirty.layer_metadata.remove(&name);
|
||||
|
||||
if let Some(meta) = meta {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
@@ -856,10 +868,10 @@ impl RemoteTimelineClient {
|
||||
// index_part update, because that needs to be uploaded before we can actually delete the
|
||||
// files.
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue);
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
}
|
||||
|
||||
with_metadata
|
||||
Ok(with_metadata)
|
||||
}
|
||||
|
||||
/// Schedules deletion for layer files which have previously been unlinked from the
|
||||
@@ -950,7 +962,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());
|
||||
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
|
||||
Ok(())
|
||||
@@ -1085,7 +1097,7 @@ impl RemoteTimelineClient {
|
||||
let deleted_at = Utc::now().naive_utc();
|
||||
stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
|
||||
|
||||
let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
|
||||
let mut index_part = stopped.upload_queue_for_deletion.dirty.clone();
|
||||
index_part.deleted_at = Some(deleted_at);
|
||||
index_part
|
||||
};
|
||||
@@ -1296,7 +1308,8 @@ impl RemoteTimelineClient {
|
||||
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.latest_files
|
||||
.dirty
|
||||
.layer_metadata
|
||||
.drain()
|
||||
.map(|(file_name, meta)| {
|
||||
remote_layer_path(
|
||||
@@ -1433,7 +1446,7 @@ impl RemoteTimelineClient {
|
||||
// Can always be scheduled.
|
||||
true
|
||||
}
|
||||
UploadOp::UploadMetadata(_, _) => {
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
// These can only be performed after all the preceding operations
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
@@ -1475,7 +1488,7 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadLayer(_, _) => {
|
||||
upload_queue.num_inprogress_layer_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata(_, _) => {
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
@@ -1584,22 +1597,13 @@ impl RemoteTimelineClient {
|
||||
)
|
||||
.await
|
||||
}
|
||||
UploadOp::UploadMetadata(ref index_part, _lsn) => {
|
||||
let mention_having_future_layers = if cfg!(feature = "testing") {
|
||||
index_part
|
||||
.layer_metadata
|
||||
.keys()
|
||||
.any(|x| x.is_in_future(*_lsn))
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
UploadOp::UploadMetadata { ref uploaded } => {
|
||||
let res = upload::upload_index_part(
|
||||
&self.storage_impl,
|
||||
&self.tenant_shard_id,
|
||||
&self.timeline_id,
|
||||
self.generation,
|
||||
index_part,
|
||||
uploaded,
|
||||
&self.cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -1609,10 +1613,21 @@ impl RemoteTimelineClient {
|
||||
)
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
self.update_remote_physical_size_gauge(Some(uploaded));
|
||||
let mention_having_future_layers = if cfg!(feature = "testing") {
|
||||
uploaded
|
||||
.layer_metadata
|
||||
.keys()
|
||||
.any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn()))
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if mention_having_future_layers {
|
||||
// find rationale near crate::tenant::timeline::init::cleanup_future_layer
|
||||
tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
|
||||
tracing::info!(
|
||||
disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(),
|
||||
"uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"
|
||||
);
|
||||
}
|
||||
}
|
||||
res
|
||||
@@ -1713,11 +1728,23 @@ impl RemoteTimelineClient {
|
||||
upload_queue.num_inprogress_layer_uploads -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
UploadOp::UploadMetadata { ref uploaded } => {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
// XXX monotonicity check?
|
||||
|
||||
upload_queue.projected_remote_consistent_lsn = Some(lsn);
|
||||
// the task id is reused as a monotonicity check for storing the "clean"
|
||||
// IndexPart.
|
||||
let last_updater = upload_queue.clean.1;
|
||||
let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id);
|
||||
let monotone = is_later || last_updater.is_none();
|
||||
|
||||
assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id);
|
||||
|
||||
// not taking ownership is wasteful
|
||||
upload_queue.clean.0.clone_from(uploaded);
|
||||
upload_queue.clean.1 = Some(task.task_id);
|
||||
|
||||
let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
|
||||
|
||||
if self.generation.is_none() {
|
||||
// Legacy mode: skip validating generation
|
||||
upload_queue.visible_remote_consistent_lsn.store(lsn);
|
||||
@@ -1771,7 +1798,7 @@ impl RemoteTimelineClient {
|
||||
RemoteOpKind::Upload,
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
|
||||
),
|
||||
UploadOp::UploadMetadata(_, _) => (
|
||||
UploadOp::UploadMetadata { .. } => (
|
||||
RemoteOpFileKind::Index,
|
||||
RemoteOpKind::Upload,
|
||||
DontTrackSize {
|
||||
@@ -1847,11 +1874,9 @@ impl RemoteTimelineClient {
|
||||
// Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
|
||||
let upload_queue_for_deletion = UploadQueueInitialized {
|
||||
task_counter: 0,
|
||||
latest_files: initialized.latest_files.clone(),
|
||||
dirty: initialized.dirty.clone(),
|
||||
clean: initialized.clean.clone(),
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: initialized.latest_metadata.clone(),
|
||||
latest_lineage: initialized.latest_lineage.clone(),
|
||||
projected_remote_consistent_lsn: None,
|
||||
visible_remote_consistent_lsn: initialized
|
||||
.visible_remote_consistent_lsn
|
||||
.clone(),
|
||||
@@ -1864,7 +1889,6 @@ impl RemoteTimelineClient {
|
||||
dangling_files: HashMap::default(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: initialized.last_aux_file_policy,
|
||||
};
|
||||
|
||||
let upload_queue = std::mem::replace(
|
||||
|
||||
@@ -28,6 +28,7 @@ use crate::TEMP_FILE_SUFFIX;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::pausable_failpoint;
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{
|
||||
@@ -152,6 +153,8 @@ async fn download_object<'a>(
|
||||
|
||||
let download = storage.download(src_path, cancel).await?;
|
||||
|
||||
pausable_failpoint!("before-downloading-layer-stream-pausable");
|
||||
|
||||
let mut buf_writer =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
@@ -199,6 +202,8 @@ async fn download_object<'a>(
|
||||
|
||||
let mut download = storage.download(src_path, cancel).await?;
|
||||
|
||||
pausable_failpoint!("before-downloading-layer-stream-pausable");
|
||||
|
||||
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
|
||||
// There's chunks_vectored() on the stream.
|
||||
let (bytes_amount, destination_file) = async {
|
||||
|
||||
@@ -11,7 +11,6 @@ use utils::id::TimelineId;
|
||||
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::upload_queue::UploadQueueInitialized;
|
||||
use crate::tenant::Generation;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
|
||||
@@ -39,12 +38,19 @@ pub struct IndexPart {
|
||||
/// that latest version stores.
|
||||
pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated for convenience when reading the serialized structure, but is
|
||||
// private because internally we would read from metadata instead.
|
||||
disk_consistent_lsn: Lsn,
|
||||
/// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
|
||||
/// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
|
||||
/// reused.
|
||||
pub(super) disk_consistent_lsn: Lsn,
|
||||
|
||||
#[serde(rename = "metadata_bytes")]
|
||||
// TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding
|
||||
// the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes"
|
||||
// for backwards compatibility.
|
||||
#[serde(
|
||||
rename = "metadata_bytes",
|
||||
alias = "metadata",
|
||||
with = "crate::tenant::metadata::modern_serde"
|
||||
)]
|
||||
pub metadata: TimelineMetadata,
|
||||
|
||||
#[serde(default)]
|
||||
@@ -73,40 +79,33 @@ impl IndexPart {
|
||||
/// - 4: timeline_layers is fully removed.
|
||||
/// - 5: lineage was added
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
const LATEST_VERSION: usize = 6;
|
||||
/// - 7: metadata_bytes is no longer written, but still read
|
||||
const LATEST_VERSION: usize = 7;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
fn new(
|
||||
layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata: TimelineMetadata,
|
||||
lineage: Lineage,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> Self {
|
||||
let layer_metadata = layers_and_metadata.clone();
|
||||
|
||||
Self {
|
||||
pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
|
||||
IndexPart {
|
||||
version: Self::LATEST_VERSION,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
layer_metadata: Default::default(),
|
||||
disk_consistent_lsn: metadata.disk_consistent_lsn(),
|
||||
metadata,
|
||||
deleted_at: None,
|
||||
lineage,
|
||||
last_aux_file_policy,
|
||||
lineage: Default::default(),
|
||||
last_aux_file_policy: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_version(&self) -> usize {
|
||||
pub fn version(&self) -> usize {
|
||||
self.version
|
||||
}
|
||||
|
||||
/// If you want this under normal operations, read it from self.metadata:
|
||||
/// this method is just for the scrubber to use when validating an index.
|
||||
pub fn get_disk_consistent_lsn(&self) -> Lsn {
|
||||
pub fn duplicated_disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
|
||||
@@ -120,14 +119,7 @@ impl IndexPart {
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn example() -> Self {
|
||||
let example_metadata = TimelineMetadata::example();
|
||||
Self::new(
|
||||
&HashMap::new(),
|
||||
example_metadata.disk_consistent_lsn(),
|
||||
example_metadata,
|
||||
Default::default(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
)
|
||||
Self::empty(TimelineMetadata::example())
|
||||
}
|
||||
|
||||
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
|
||||
@@ -135,22 +127,6 @@ impl IndexPart {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&UploadQueueInitialized> for IndexPart {
|
||||
fn from(uq: &UploadQueueInitialized) -> Self {
|
||||
let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
|
||||
let metadata = uq.latest_metadata.clone();
|
||||
let lineage = uq.latest_lineage.clone();
|
||||
|
||||
Self::new(
|
||||
&uq.latest_files,
|
||||
disk_consistent_lsn,
|
||||
metadata,
|
||||
lineage,
|
||||
uq.last_aux_file_policy,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
@@ -236,19 +212,18 @@ impl Lineage {
|
||||
/// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
|
||||
/// to start a read/write primary at this lsn".
|
||||
///
|
||||
/// Returns true if the Lsn was previously a branch point.
|
||||
/// Returns true if the Lsn was previously our branch point.
|
||||
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
|
||||
self.original_ancestor
|
||||
.as_ref()
|
||||
.is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
|
||||
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::*;
|
||||
use std::str::FromStr;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
#[test]
|
||||
fn v1_indexpart_is_parsed() {
|
||||
@@ -367,8 +342,7 @@ mod tests {
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
@@ -544,8 +518,7 @@ mod tests {
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
@@ -558,6 +531,60 @@ mod tests {
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v7_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 7,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"deleted_at": "2023-07-31T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 7,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
lineage: Default::default(),
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
|
||||
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use fail::fail_point;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -11,10 +12,10 @@ use tokio::io::AsyncSeekExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, pausable_failpoint};
|
||||
|
||||
use super::index::IndexPart;
|
||||
use super::Generation;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path,
|
||||
remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
|
||||
};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -27,7 +28,7 @@ pub(crate) async fn upload_index_part<'a>(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
generation: Generation,
|
||||
index_part: &'a IndexPart,
|
||||
index_part: &IndexPart,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::trace!("uploading new index part");
|
||||
@@ -37,16 +38,16 @@ pub(crate) async fn upload_index_part<'a>(
|
||||
});
|
||||
pausable_failpoint!("before-upload-index-pausable");
|
||||
|
||||
let index_part_bytes = index_part
|
||||
.to_s3_bytes()
|
||||
.context("serialize index part file into bytes")?;
|
||||
let index_part_size = index_part_bytes.len();
|
||||
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
|
||||
// FIXME: this error comes too late
|
||||
let serialized = index_part.to_s3_bytes()?;
|
||||
let serialized = Bytes::from(serialized);
|
||||
|
||||
let index_part_size = serialized.len();
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
|
||||
storage
|
||||
.upload_storage_object(
|
||||
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
|
||||
futures::stream::once(futures::future::ready(Ok(serialized))),
|
||||
index_part_size,
|
||||
&remote_path,
|
||||
cancel,
|
||||
|
||||
@@ -1000,7 +1000,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
layer.name,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
let downloaded_bytes = download_layer_file(
|
||||
self.conf,
|
||||
self.remote_storage,
|
||||
*tenant_shard_id,
|
||||
@@ -1011,8 +1011,9 @@ impl<'a> TenantDownloader<'a> {
|
||||
&self.secondary_state.cancel,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
.await;
|
||||
|
||||
let downloaded_bytes = match downloaded_bytes {
|
||||
Ok(bytes) => bytes,
|
||||
Err(DownloadError::NotFound) => {
|
||||
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
|
||||
|
||||
@@ -334,8 +334,11 @@ where
|
||||
|
||||
let tenant_shard_id = job.get_tenant_shard_id();
|
||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||
tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Command already running, waiting for it");
|
||||
tracing::info!(
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Command already running, waiting for it"
|
||||
);
|
||||
barrier
|
||||
} else {
|
||||
let running = self.spawn_now(job);
|
||||
|
||||
@@ -478,6 +478,23 @@ impl DeltaLayerWriterInner {
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let temp_path = self.path.clone();
|
||||
let result = self.finish0(key_end, timeline, ctx).await;
|
||||
if result.is_err() {
|
||||
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
async fn finish0(
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
@@ -651,19 +668,11 @@ impl DeltaLayerWriter {
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let inner = self.inner.take().unwrap();
|
||||
let temp_path = inner.path.clone();
|
||||
let result = inner.finish(key_end, timeline, ctx).await;
|
||||
// The delta layer files can sometimes be really large. Clean them up.
|
||||
if result.is_err() {
|
||||
tracing::warn!(
|
||||
"Cleaning up temporary delta file {temp_path} after error during writing"
|
||||
);
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}")
|
||||
}
|
||||
}
|
||||
result
|
||||
self.inner
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(key_end, timeline, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -920,6 +929,45 @@ impl DeltaLayerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
let mut result = Vec::new();
|
||||
let mut stream =
|
||||
Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
while let Some(item) = stream.next().await {
|
||||
let (key, lsn, pos) = item?;
|
||||
// TODO: dedup code with get_reconstruct_value
|
||||
// TODO: ctx handling and sharding
|
||||
cursor
|
||||
.read_blob_into_buf(pos.pos(), &mut buf, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to read blob from virtual file {}", self.file.path)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
self.file.path
|
||||
)
|
||||
})?;
|
||||
result.push((key, lsn, val));
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn plan_reads<Reader>(
|
||||
keyspace: &KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
@@ -485,6 +485,34 @@ impl ImageLayerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
let mut result = Vec::new();
|
||||
let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
while let Some(item) = stream.next().await {
|
||||
// TODO: dedup code with get_reconstruct_value
|
||||
let (raw_key, offset) = item?;
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
// TODO: ctx handling and sharding
|
||||
let blob = cursor
|
||||
.read_blob(offset, ctx)
|
||||
.await
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
result.push((key, self.lsn, Value::Image(value)));
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
|
||||
/// and the keys in this layer.
|
||||
///
|
||||
@@ -917,26 +945,57 @@ impl Drop for ImageLayerWriter {
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
};
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
|
||||
use crate::{
|
||||
tenant::{config::TenantConf, harness::TenantHarness},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
use super::ImageLayerWriter;
|
||||
|
||||
#[tokio::test]
|
||||
async fn image_layer_rewrite() {
|
||||
let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tenant_conf = TenantConf {
|
||||
gc_period: Duration::ZERO,
|
||||
compaction_period: Duration::ZERO,
|
||||
..TenantConf::default()
|
||||
};
|
||||
let tenant_id = TenantId::generate();
|
||||
let mut gen = Generation::new(0xdead0001);
|
||||
let mut get_next_gen = || {
|
||||
let ret = gen;
|
||||
gen = gen.next();
|
||||
ret
|
||||
};
|
||||
// The LSN at which we will create an image layer to filter
|
||||
let lsn = Lsn(0xdeadbeef0000);
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
//
|
||||
// Create an unsharded parent with a layer.
|
||||
//
|
||||
|
||||
let harness = TenantHarness::create_custom(
|
||||
"test_image_layer_rewrite--parent",
|
||||
tenant_conf.clone(),
|
||||
tenant_id,
|
||||
ShardIdentity::unsharded(),
|
||||
get_next_gen(),
|
||||
)
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
@@ -971,9 +1030,47 @@ mod test {
|
||||
};
|
||||
let original_size = resident.metadata().file_size;
|
||||
|
||||
//
|
||||
// Create child shards and do the rewrite, exercising filter().
|
||||
// TODO: abstraction in TenantHarness for splits.
|
||||
//
|
||||
|
||||
// Filter for various shards: this exercises cases like values at start of key range, end of key
|
||||
// range, middle of key range.
|
||||
for shard_number in 0..4 {
|
||||
let shard_count = ShardCount::new(4);
|
||||
for shard_number in 0..shard_count.count() {
|
||||
//
|
||||
// mimic the shard split
|
||||
//
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
shard_count,
|
||||
ShardStripeSize(0x8000),
|
||||
)
|
||||
.unwrap();
|
||||
let harness = TenantHarness::create_custom(
|
||||
Box::leak(Box::new(format!(
|
||||
"test_image_layer_rewrite--child{}",
|
||||
shard_identity.shard_slug()
|
||||
))),
|
||||
tenant_conf.clone(),
|
||||
tenant_id,
|
||||
shard_identity,
|
||||
// NB: in reality, the shards would each fork off their own gen number sequence from the parent.
|
||||
// But here, all we care about is that the gen number is unique.
|
||||
get_next_gen(),
|
||||
)
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
//
|
||||
// use filter() and make assertions
|
||||
//
|
||||
|
||||
let mut filtered_writer = ImageLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
@@ -985,15 +1082,6 @@ mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
|
||||
// to exercise filter()
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
ShardCount::new(4),
|
||||
ShardStripeSize(0x8000),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let wrote_keys = resident
|
||||
.filter(&shard_identity, &mut filtered_writer, &ctx)
|
||||
.await
|
||||
|
||||
@@ -52,7 +52,7 @@ pub struct InMemoryLayer {
|
||||
|
||||
/// Frozen layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is `None`.
|
||||
end_lsn: OnceLock<Lsn>,
|
||||
pub(crate) end_lsn: OnceLock<Lsn>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
|
||||
local_path_str: Arc<str>,
|
||||
|
||||
@@ -277,9 +277,10 @@ impl Layer {
|
||||
|
||||
let downloaded = resident.expect("just initialized");
|
||||
|
||||
// if the rename works, the path is as expected
|
||||
// TODO: sync system call
|
||||
std::fs::rename(temp_path, owner.local_path())
|
||||
// We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
|
||||
// TODO: this leaves the temp file in place if the rename fails, risking us running
|
||||
// out of space. Should we clean it up here or does the calling context deal with this?
|
||||
utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
|
||||
.with_context(|| format!("rename temporary file as correct path for {owner}"))?;
|
||||
|
||||
Ok(ResidentLayer { downloaded, owner })
|
||||
@@ -387,6 +388,23 @@ impl Layer {
|
||||
})
|
||||
}
|
||||
|
||||
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
|
||||
let layer = self
|
||||
.0
|
||||
.get_or_maybe_download(true, Some(ctx))
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
layer.load_key_values(&self.0, ctx).await
|
||||
}
|
||||
|
||||
/// Download the layer if evicted.
|
||||
///
|
||||
/// Will not error when the layer is already downloaded.
|
||||
@@ -1756,6 +1774,20 @@ impl DownloadedLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn load_key_values(
|
||||
&self,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.get(owner, ctx).await? {
|
||||
Delta(d) => d.load_key_values(ctx).await,
|
||||
Image(i) => i.load_key_values(ctx).await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
use LayerKind::*;
|
||||
match self.get(owner, ctx).await? {
|
||||
|
||||
@@ -815,6 +815,7 @@ async fn eviction_cancellation_on_drop() {
|
||||
/// A test case to remind you the cost of these structures. You can bump the size limit
|
||||
/// below if it is really necessary to add more fields to the structures.
|
||||
#[test]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn layer_size() {
|
||||
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
|
||||
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
pub(crate) mod analysis;
|
||||
mod compaction;
|
||||
pub mod delete;
|
||||
pub(crate) mod detach_ancestor;
|
||||
@@ -102,7 +103,6 @@ use crate::metrics::{
|
||||
};
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
|
||||
@@ -322,6 +322,8 @@ pub struct Timeline {
|
||||
/// Locked automatically by [`TimelineWriter`] and checkpointer.
|
||||
/// Must always be acquired before the layer map/individual layer lock
|
||||
/// to avoid deadlock.
|
||||
///
|
||||
/// The state is cleared upon freezing.
|
||||
write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
|
||||
|
||||
/// Used to avoid multiple `flush_loop` tasks running
|
||||
@@ -1569,7 +1571,15 @@ impl Timeline {
|
||||
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
|
||||
// polluting the span hierarchy.
|
||||
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
|
||||
let to_lsn = self.freeze_inmem_layer(false).await;
|
||||
let to_lsn = {
|
||||
// Freeze the current open in-memory layer. It will be written to disk on next
|
||||
// iteration.
|
||||
let mut g = self.write_lock.lock().await;
|
||||
|
||||
let to_lsn = self.get_last_record_lsn();
|
||||
self.freeze_inmem_layer_at(to_lsn, &mut g).await;
|
||||
to_lsn
|
||||
};
|
||||
self.flush_frozen_layers_and_wait(to_lsn).await
|
||||
}
|
||||
|
||||
@@ -1578,7 +1588,7 @@ impl Timeline {
|
||||
// an ephemeral layer open forever when idle. It also freezes layers if the global limit on
|
||||
// ephemeral layer bytes has been breached.
|
||||
pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
|
||||
let Ok(_write_guard) = self.write_lock.try_lock() else {
|
||||
let Ok(mut write_guard) = self.write_lock.try_lock() else {
|
||||
// If the write lock is held, there is an active wal receiver: rolling open layers
|
||||
// is their responsibility while they hold this lock.
|
||||
return;
|
||||
@@ -1655,24 +1665,35 @@ impl Timeline {
|
||||
self.last_freeze_at.load(),
|
||||
open_layer.get_opened_at(),
|
||||
) {
|
||||
match open_layer.info() {
|
||||
let at_lsn = match open_layer.info() {
|
||||
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
|
||||
// We may reach this point if the layer was already frozen by not yet flushed: flushing
|
||||
// happens asynchronously in the background.
|
||||
tracing::debug!(
|
||||
"Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
|
||||
);
|
||||
None
|
||||
}
|
||||
InMemoryLayerInfo::Open { .. } => {
|
||||
// Upgrade to a write lock and freeze the layer
|
||||
drop(layers_guard);
|
||||
let mut layers_guard = self.layers.write().await;
|
||||
layers_guard
|
||||
.try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
|
||||
let froze = layers_guard
|
||||
.try_freeze_in_memory_layer(
|
||||
current_lsn,
|
||||
&self.last_freeze_at,
|
||||
&mut write_guard,
|
||||
)
|
||||
.await;
|
||||
Some(current_lsn).filter(|_| froze)
|
||||
}
|
||||
};
|
||||
if let Some(lsn) = at_lsn {
|
||||
let res: Result<u64, _> = self.flush_frozen_layers(lsn);
|
||||
if let Err(e) = res {
|
||||
tracing::info!("failed to flush frozen layer after background freeze: {e:#}");
|
||||
}
|
||||
}
|
||||
self.flush_frozen_layers();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2036,11 +2057,11 @@ impl Timeline {
|
||||
true
|
||||
} else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
opened_at.elapsed()
|
||||
);
|
||||
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
opened_at.elapsed()
|
||||
);
|
||||
|
||||
true
|
||||
} else {
|
||||
@@ -2381,7 +2402,7 @@ impl Timeline {
|
||||
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
|
||||
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
|
||||
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
|
||||
assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
|
||||
assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
|
||||
*flush_loop_state = FlushLoopState::Exited;
|
||||
Ok(())
|
||||
}
|
||||
@@ -2788,17 +2809,21 @@ impl Timeline {
|
||||
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
|
||||
};
|
||||
|
||||
match self_ref
|
||||
let calculated_size = self_ref
|
||||
.logical_size_calculation_task(
|
||||
initial_part_end,
|
||||
LogicalSizeCalculationCause::Initial,
|
||||
background_ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
.await?;
|
||||
|
||||
self_ref
|
||||
.trigger_aux_file_size_computation(initial_part_end, background_ctx)
|
||||
.await?;
|
||||
|
||||
// TODO: add aux file size to logical size
|
||||
|
||||
Ok((calculated_size, metrics_guard))
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3191,7 +3216,7 @@ impl Timeline {
|
||||
|
||||
// Recurse into ancestor if needed
|
||||
if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
|
||||
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
|
||||
if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
|
||||
trace!(
|
||||
"going into ancestor {}, cont_lsn is {}",
|
||||
timeline.ancestor_lsn,
|
||||
@@ -3640,28 +3665,21 @@ impl Timeline {
|
||||
self.last_record_lsn.advance(new_lsn);
|
||||
}
|
||||
|
||||
/// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
|
||||
/// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
|
||||
async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
|
||||
// Freeze the current open in-memory layer. It will be written to disk on next
|
||||
// iteration.
|
||||
|
||||
let _write_guard = if write_lock_held {
|
||||
None
|
||||
} else {
|
||||
Some(self.write_lock.lock().await)
|
||||
async fn freeze_inmem_layer_at(
|
||||
&self,
|
||||
at: Lsn,
|
||||
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
|
||||
) {
|
||||
let frozen = {
|
||||
let mut guard = self.layers.write().await;
|
||||
guard
|
||||
.try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
|
||||
.await
|
||||
};
|
||||
|
||||
let to_lsn = self.get_last_record_lsn();
|
||||
self.freeze_inmem_layer_at(to_lsn).await;
|
||||
to_lsn
|
||||
}
|
||||
|
||||
async fn freeze_inmem_layer_at(&self, at: Lsn) {
|
||||
let mut guard = self.layers.write().await;
|
||||
guard
|
||||
.try_freeze_in_memory_layer(at, &self.last_freeze_at)
|
||||
.await;
|
||||
if frozen {
|
||||
let now = Instant::now();
|
||||
*(self.last_freeze_ts.write().unwrap()) = now;
|
||||
}
|
||||
}
|
||||
|
||||
/// Layer flusher task's main loop.
|
||||
@@ -3755,18 +3773,14 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
|
||||
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
|
||||
/// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk.
|
||||
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`].
|
||||
///
|
||||
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
|
||||
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
|
||||
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
|
||||
async fn flush_frozen_layers_and_wait(
|
||||
&self,
|
||||
last_record_lsn: Lsn,
|
||||
) -> Result<(), FlushLayerError> {
|
||||
let mut rx = self.layer_flush_done_tx.subscribe();
|
||||
|
||||
/// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the
|
||||
/// case, it means no data will be written between the top of the highest frozen layer and
|
||||
/// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data
|
||||
/// locally for that part of the WAL.
|
||||
fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result<u64, FlushLayerError> {
|
||||
// Increment the flush cycle counter and wake up the flush task.
|
||||
// Remember the new value, so that when we listen for the flush
|
||||
// to finish, we know when the flush that we initiated has
|
||||
@@ -3781,13 +3795,18 @@ impl Timeline {
|
||||
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||
my_flush_request = *counter + 1;
|
||||
*counter = my_flush_request;
|
||||
*lsn = std::cmp::max(last_record_lsn, *lsn);
|
||||
*lsn = std::cmp::max(at_lsn, *lsn);
|
||||
});
|
||||
|
||||
Ok(my_flush_request)
|
||||
}
|
||||
|
||||
async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
|
||||
let mut rx = self.layer_flush_done_tx.subscribe();
|
||||
loop {
|
||||
{
|
||||
let (last_result_counter, last_result) = &*rx.borrow();
|
||||
if *last_result_counter >= my_flush_request {
|
||||
if *last_result_counter >= request {
|
||||
if let Err(err) = last_result {
|
||||
// We already logged the original error in
|
||||
// flush_loop. We cannot propagate it to the caller
|
||||
@@ -3814,12 +3833,9 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
fn flush_frozen_layers(&self) {
|
||||
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||
*counter += 1;
|
||||
|
||||
*lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
|
||||
});
|
||||
async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> {
|
||||
let token = self.flush_frozen_layers(at_lsn)?;
|
||||
self.wait_flush_completion(token).await
|
||||
}
|
||||
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
@@ -3880,22 +3896,25 @@ impl Timeline {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
// FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well?
|
||||
// This code path will not be hit during regression tests. After #7099 we have a single partition
|
||||
// with two key ranges. If someone wants to fix initdb optimization in the future, this might need
|
||||
// to be fixed.
|
||||
|
||||
// For metadata, always create delta layers.
|
||||
let delta_layer = if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single aux file keyspace"
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
let metadata_keyspace = &metadata_partition.parts[0];
|
||||
assert_eq!(
|
||||
metadata_keyspace.0.ranges.len(),
|
||||
1,
|
||||
"aux file keyspace should be a single range"
|
||||
);
|
||||
self.create_delta_layer(
|
||||
&frozen_layer,
|
||||
Some(metadata_keyspace.0.ranges[0].clone()),
|
||||
Some(
|
||||
metadata_keyspace.0.ranges.first().unwrap().start
|
||||
..metadata_keyspace.0.ranges.last().unwrap().end,
|
||||
),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -4262,7 +4281,7 @@ impl Timeline {
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
|
||||
if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() {
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
@@ -4312,6 +4331,7 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
img_range: Range<Key>,
|
||||
mode: ImageLayerCreationMode,
|
||||
start: Key,
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
|
||||
|
||||
@@ -4320,39 +4340,43 @@ impl Timeline {
|
||||
let data = self
|
||||
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
let (data, total_kb_retrieved, total_key_retrieved) = {
|
||||
let (data, total_kb_retrieved, total_keys_retrieved) = {
|
||||
let mut new_data = BTreeMap::new();
|
||||
let mut total_kb_retrieved = 0;
|
||||
let mut total_key_retrieved = 0;
|
||||
let mut total_keys_retrieved = 0;
|
||||
for (k, v) in data {
|
||||
let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
|
||||
total_kb_retrieved += KEY_SIZE + v.len();
|
||||
total_key_retrieved += 1;
|
||||
total_keys_retrieved += 1;
|
||||
new_data.insert(k, v);
|
||||
}
|
||||
(new_data, total_kb_retrieved / 1024, total_key_retrieved)
|
||||
(new_data, total_kb_retrieved / 1024, total_keys_retrieved)
|
||||
};
|
||||
let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
|
||||
let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
debug!(
|
||||
"generate image layers for metadata keys: trigger_generation={trigger_generation}, \
|
||||
delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
|
||||
total_key_retrieved={total_key_retrieved}"
|
||||
trigger_generation,
|
||||
delta_files_accessed,
|
||||
total_kb_retrieved,
|
||||
total_keys_retrieved,
|
||||
"generate metadata images"
|
||||
);
|
||||
|
||||
if !trigger_generation && mode == ImageLayerCreationMode::Try {
|
||||
return Ok(ImageLayerCreationOutcome {
|
||||
image: None,
|
||||
next_start_key: img_range.end,
|
||||
});
|
||||
}
|
||||
let has_keys = !data.is_empty();
|
||||
let mut wrote_any_image = false;
|
||||
for (k, v) in data {
|
||||
// Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
|
||||
// considers this situation properly.
|
||||
// if v.is_empty() {
|
||||
// continue;
|
||||
// }
|
||||
if v.is_empty() {
|
||||
// the key has been deleted, it does not need an image
|
||||
// in metadata keyspace, an empty image == tombstone
|
||||
continue;
|
||||
}
|
||||
wrote_any_image = true;
|
||||
|
||||
// No need to handle sharding b/c metadata keys are always on the 0-th shard.
|
||||
|
||||
@@ -4360,16 +4384,26 @@ impl Timeline {
|
||||
// on the normal data path either.
|
||||
image_layer_writer.put_image(k, v, ctx).await?;
|
||||
}
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: if has_keys {
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
Some(image_layer)
|
||||
} else {
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
None
|
||||
},
|
||||
next_start_key: img_range.end,
|
||||
})
|
||||
|
||||
if wrote_any_image {
|
||||
// Normal path: we have written some data into the new image layer for this
|
||||
// partition, so flush it to disk.
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: Some(image_layer),
|
||||
next_start_key: img_range.end,
|
||||
})
|
||||
} else {
|
||||
// Special case: the image layer may be empty if this is a sharded tenant and the
|
||||
// partition does not cover any keys owned by this shard. In this case, to ensure
|
||||
// we don't leave gaps between image layers, leave `start` where it is, so that the next
|
||||
// layer we write will cover the key range that we just scanned.
|
||||
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
|
||||
Ok(ImageLayerCreationOutcome {
|
||||
image: None,
|
||||
next_start_key: start,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
|
||||
@@ -4425,6 +4459,12 @@ impl Timeline {
|
||||
if mode == ImageLayerCreationMode::Initial {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
}
|
||||
if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
|
||||
// Skip compaction if there are not enough updates. Metadata compaction will do a scan and
|
||||
// might mess up with evictions.
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
} else if let ImageLayerCreationMode::Try = mode {
|
||||
// check_for_image_layers = false -> skip
|
||||
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
|
||||
@@ -4479,6 +4519,7 @@ impl Timeline {
|
||||
ctx,
|
||||
img_range,
|
||||
mode,
|
||||
start,
|
||||
)
|
||||
.await?;
|
||||
start = next_start_key;
|
||||
@@ -5448,11 +5489,12 @@ impl Timeline {
|
||||
let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
|
||||
let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
|
||||
let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
|
||||
let max_lsn = Lsn(deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap().0 + 1);
|
||||
let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
|
||||
assert!(
|
||||
max_lsn <= last_record_lsn,
|
||||
"advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
|
||||
);
|
||||
let end_lsn = Lsn(max_lsn.0 + 1);
|
||||
if let Some(check_start_lsn) = check_start_lsn {
|
||||
assert!(min_lsn >= check_start_lsn);
|
||||
}
|
||||
@@ -5461,7 +5503,7 @@ impl Timeline {
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
min_key,
|
||||
min_lsn..max_lsn,
|
||||
min_lsn..end_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5477,10 +5519,56 @@ impl Timeline {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return all keys at the LSN in the image layers
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn inspect_image_layers(
|
||||
self: &Arc<Timeline>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Bytes)>> {
|
||||
let mut all_data = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
for layer in guard.layer_map().iter_historic_layers() {
|
||||
if !layer.is_delta() && layer.image_layer_lsn() == lsn {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
let mut reconstruct_data = ValuesReconstructState::default();
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
KeySpace::single(Key::MIN..Key::MAX),
|
||||
lsn..Lsn(lsn.0 + 1),
|
||||
&mut reconstruct_data,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
for (k, v) in reconstruct_data.keys {
|
||||
all_data.push((k, v?.img.unwrap().1));
|
||||
}
|
||||
}
|
||||
}
|
||||
all_data.sort();
|
||||
Ok(all_data)
|
||||
}
|
||||
|
||||
/// Get all historic layer descriptors in the layer map
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn inspect_historic_layers(
|
||||
self: &Arc<Timeline>,
|
||||
) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
|
||||
let mut layers = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
for layer in guard.layer_map().iter_historic_layers() {
|
||||
layers.push(layer.key());
|
||||
}
|
||||
Ok(layers)
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
|
||||
|
||||
/// Tracking writes ingestion does to a particular in-memory layer.
|
||||
///
|
||||
/// Cleared upon freezing a layer.
|
||||
struct TimelineWriterState {
|
||||
open_layer: Arc<InMemoryLayer>,
|
||||
current_size: u64,
|
||||
@@ -5521,12 +5609,6 @@ impl Deref for TimelineWriter<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineWriter<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.write_guard.take();
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum OpenLayerAction {
|
||||
Roll,
|
||||
@@ -5609,17 +5691,16 @@ impl<'a> TimelineWriter<'a> {
|
||||
}
|
||||
|
||||
async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
|
||||
assert!(self.write_guard.is_some());
|
||||
|
||||
self.tl.freeze_inmem_layer_at(freeze_at).await;
|
||||
|
||||
let now = Instant::now();
|
||||
*(self.last_freeze_ts.write().unwrap()) = now;
|
||||
|
||||
self.tl.flush_frozen_layers();
|
||||
|
||||
let current_size = self.write_guard.as_ref().unwrap().current_size;
|
||||
if current_size > self.get_checkpoint_distance() {
|
||||
|
||||
// self.write_guard will be taken by the freezing
|
||||
self.tl
|
||||
.freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
|
||||
.await;
|
||||
|
||||
self.tl.flush_frozen_layers(freeze_at)?;
|
||||
|
||||
if current_size >= self.get_checkpoint_distance() * 2 {
|
||||
warn!("Flushed oversized open layer with size {}", current_size)
|
||||
}
|
||||
|
||||
@@ -5632,9 +5713,27 @@ impl<'a> TimelineWriter<'a> {
|
||||
return OpenLayerAction::Open;
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
|
||||
// this check and assertion are not really needed because
|
||||
// LayerManager::try_freeze_in_memory_layer will always clear out the
|
||||
// TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there
|
||||
// is no TimelineWriterState.
|
||||
assert!(
|
||||
state.open_layer.end_lsn.get().is_some(),
|
||||
"our open_layer must be outdated"
|
||||
);
|
||||
|
||||
// this would be a memory leak waiting to happen because the in-memory layer always has
|
||||
// an index
|
||||
panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
|
||||
}
|
||||
|
||||
if state.prev_lsn == Some(lsn) {
|
||||
// Rolling mid LSN is not supported by downstream code.
|
||||
// Rolling mid LSN is not supported by [downstream code].
|
||||
// Hence, only roll at LSN boundaries.
|
||||
//
|
||||
// [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
|
||||
return OpenLayerAction::None;
|
||||
}
|
||||
|
||||
|
||||
90
pageserver/src/tenant/timeline/analysis.rs
Normal file
90
pageserver/src/tenant/timeline/analysis.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
use std::{collections::BTreeSet, ops::Range};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
pub(crate) struct RangeAnalysis {
|
||||
start: String,
|
||||
end: String,
|
||||
has_image: bool,
|
||||
num_of_deltas_above_image: usize,
|
||||
total_num_of_deltas: usize,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub(crate) async fn perf_info(&self) -> Vec<RangeAnalysis> {
|
||||
// First, collect all split points of the layers.
|
||||
let mut split_points = BTreeSet::new();
|
||||
let mut delta_ranges = Vec::new();
|
||||
let mut image_ranges = Vec::new();
|
||||
|
||||
let all_layer_files = {
|
||||
let guard = self.layers.read().await;
|
||||
guard.all_persistent_layers()
|
||||
};
|
||||
let lsn = self.get_last_record_lsn();
|
||||
|
||||
for key in all_layer_files {
|
||||
split_points.insert(key.key_range.start);
|
||||
split_points.insert(key.key_range.end);
|
||||
if key.is_delta {
|
||||
delta_ranges.push((key.key_range.clone(), key.lsn_range.clone()));
|
||||
} else {
|
||||
image_ranges.push((key.key_range.clone(), key.lsn_range.start));
|
||||
}
|
||||
}
|
||||
|
||||
// For each split range, compute the estimated read amplification.
|
||||
let split_points = split_points.into_iter().collect::<Vec<_>>();
|
||||
|
||||
let mut result = Vec::new();
|
||||
|
||||
for i in 0..(split_points.len() - 1) {
|
||||
let start = split_points[i];
|
||||
let end = split_points[i + 1];
|
||||
// Find the latest image layer that contains the information.
|
||||
let mut maybe_image_layers = image_ranges
|
||||
.iter()
|
||||
// We insert split points for all image layers, and therefore a `contains` check for the start point should be enough.
|
||||
.filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1));
|
||||
let image_layer = maybe_image_layers.last().cloned();
|
||||
let lsn_filter_start = image_layer
|
||||
.as_ref()
|
||||
.map(|(_, lsn)| *lsn)
|
||||
.unwrap_or(Lsn::INVALID);
|
||||
|
||||
fn overlaps_with(lsn_range_a: &Range<Lsn>, lsn_range_b: &Range<Lsn>) -> bool {
|
||||
!(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end)
|
||||
}
|
||||
|
||||
let maybe_delta_layers = delta_ranges
|
||||
.iter()
|
||||
.filter(|(key_range, lsn_range)| {
|
||||
key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range)
|
||||
})
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let pitr_delta_layers = delta_ranges
|
||||
.iter()
|
||||
.filter(|(key_range, _)| key_range.contains(&start))
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
result.push(RangeAnalysis {
|
||||
start: start.to_string(),
|
||||
end: end.to_string(),
|
||||
has_image: image_layer.is_some(),
|
||||
num_of_deltas_above_image: maybe_delta_layers.len(),
|
||||
total_num_of_deltas: pitr_delta_layers.len(),
|
||||
});
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
@@ -133,8 +133,7 @@ impl Timeline {
|
||||
},
|
||||
&image_ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
.await?;
|
||||
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
partitioning.parts.len()
|
||||
@@ -422,48 +421,6 @@ impl Timeline {
|
||||
return Ok(CompactLevel0Phase1Result::default());
|
||||
}
|
||||
|
||||
// This failpoint is used together with `test_duplicate_layers` integration test.
|
||||
// It returns the compaction result exactly the same layers as input to compaction.
|
||||
// We want to ensure that this will not cause any problem when updating the layer map
|
||||
// after the compaction is finished.
|
||||
//
|
||||
// Currently, there are two rare edge cases that will cause duplicated layers being
|
||||
// inserted.
|
||||
// 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
|
||||
// is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
|
||||
// map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
|
||||
// point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
|
||||
// and this causes an overwrite. This is acceptable because the content is the same, and we should do a
|
||||
// layer replace instead of the normal remove / upload process.
|
||||
// 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
|
||||
// size length. Compaction will likely create the same set of n files afterwards.
|
||||
//
|
||||
// This failpoint is a superset of both of the cases.
|
||||
if cfg!(feature = "testing") {
|
||||
let active = (|| {
|
||||
::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
|
||||
false
|
||||
})();
|
||||
|
||||
if active {
|
||||
let mut new_layers = Vec::with_capacity(level0_deltas.len());
|
||||
for delta in &level0_deltas {
|
||||
// we are just faking these layers as being produced again for this failpoint
|
||||
new_layers.push(
|
||||
delta
|
||||
.download_and_keep_resident()
|
||||
.await
|
||||
.context("download layer for failpoint")?,
|
||||
);
|
||||
}
|
||||
tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
|
||||
return Ok(CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact: level0_deltas,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Gather the files to compact in this iteration.
|
||||
//
|
||||
// Start with the oldest Level 0 delta file, and collect any other
|
||||
@@ -995,6 +952,178 @@ impl Timeline {
|
||||
adaptor.flush_updates().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
///
|
||||
/// The current implementation picks all delta + image layers that are below or intersecting with
|
||||
/// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
|
||||
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
|
||||
/// and create delta layers with all deltas >= gc horizon.
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
_cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
use crate::tenant::storage_layer::ValueReconstructState;
|
||||
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
|
||||
// The layer selection has the following properties:
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
|
||||
let (layer_selection, gc_cutoff) = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
|
||||
let mut selected_layers = Vec::new();
|
||||
// TODO: consider retain_lsns
|
||||
drop(gc_info);
|
||||
for desc in layers.iter_historic_layers() {
|
||||
if desc.get_lsn_range().start <= gc_cutoff {
|
||||
selected_layers.push(guard.get_from_desc(&desc));
|
||||
}
|
||||
}
|
||||
(selected_layers, gc_cutoff)
|
||||
};
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
let mut all_key_values = Vec::new();
|
||||
for layer in &layer_selection {
|
||||
all_key_values.extend(layer.load_key_values(ctx).await?);
|
||||
}
|
||||
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
|
||||
// image layers, make image appear later than delta.
|
||||
struct ValueWrapper<'a>(&'a crate::repository::Value);
|
||||
impl Ord for ValueWrapper<'_> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
use crate::repository::Value;
|
||||
use std::cmp::Ordering;
|
||||
match (self.0, other.0) {
|
||||
(Value::Image(_), Value::WalRecord(_)) => Ordering::Greater,
|
||||
(Value::WalRecord(_), Value::Image(_)) => Ordering::Less,
|
||||
_ => Ordering::Equal,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl PartialOrd for ValueWrapper<'_> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl PartialEq for ValueWrapper<'_> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == std::cmp::Ordering::Equal
|
||||
}
|
||||
}
|
||||
impl Eq for ValueWrapper<'_> {}
|
||||
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
|
||||
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
|
||||
});
|
||||
let max_lsn = all_key_values
|
||||
.iter()
|
||||
.map(|(_, lsn, _)| lsn)
|
||||
.max()
|
||||
.copied()
|
||||
.unwrap()
|
||||
+ 1;
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
|
||||
|
||||
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
|
||||
async fn flush_accumulated_states(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
|
||||
horizon: Lsn,
|
||||
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
|
||||
let mut base_image = None;
|
||||
let mut keys_above_horizon = Vec::new();
|
||||
let mut delta_above_base_image = Vec::new();
|
||||
// We have a list of deltas/images. We want to create image layers while collect garbages.
|
||||
for (key, lsn, val) in accumulated_values.iter().rev() {
|
||||
if *lsn > horizon {
|
||||
keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both
|
||||
} else if *lsn <= horizon {
|
||||
match val {
|
||||
crate::repository::Value::Image(image) => {
|
||||
if lsn <= &horizon {
|
||||
base_image = Some((*lsn, image.clone()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
crate::repository::Value::WalRecord(wal) => {
|
||||
delta_above_base_image.push((*lsn, wal.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
delta_above_base_image.reverse();
|
||||
keys_above_horizon.reverse();
|
||||
let state = ValueReconstructState {
|
||||
img: base_image,
|
||||
records: delta_above_base_image,
|
||||
};
|
||||
let img = tline.reconstruct_value(key, horizon, state).await?;
|
||||
Ok((keys_above_horizon, img))
|
||||
}
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
all_key_values.first().unwrap().0,
|
||||
gc_cutoff..max_lsn, // TODO: off by one?
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
|
||||
gc_cutoff,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for item @ (key, _, _) in &all_key_values {
|
||||
if &last_key == key {
|
||||
accumulated_values.push(item);
|
||||
} else {
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
|
||||
.await?;
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
accumulated_values.clear();
|
||||
accumulated_values.push(item);
|
||||
last_key = *key;
|
||||
}
|
||||
}
|
||||
let (deltas, image) =
|
||||
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
|
||||
image_layer_writer.put_image(last_key, image, ctx).await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
accumulated_values.clear();
|
||||
// TODO: split layers
|
||||
let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?;
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.finish_gc_compaction(
|
||||
&layer_selection,
|
||||
&[delta_layer.clone(), image_layer.clone()],
|
||||
&self.metrics,
|
||||
)
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct TimelineAdaptor {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use tracing::trace;
|
||||
@@ -20,6 +21,8 @@ use crate::{
|
||||
},
|
||||
};
|
||||
|
||||
use super::TimelineWriterState;
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct LayerManager {
|
||||
@@ -119,18 +122,20 @@ impl LayerManager {
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||
pub(crate) async fn try_freeze_in_memory_layer(
|
||||
/// Tries to freeze an open layer and also manages clearing the TimelineWriterState.
|
||||
///
|
||||
/// Returns true if anything was frozen.
|
||||
pub(super) async fn try_freeze_in_memory_layer(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_freeze_at: &AtomicLsn,
|
||||
) {
|
||||
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
|
||||
) -> bool {
|
||||
let Lsn(last_record_lsn) = lsn;
|
||||
let end_lsn = Lsn(last_record_lsn + 1);
|
||||
|
||||
if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
let froze = if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
let open_layer_rc = Arc::clone(open_layer);
|
||||
// Does this layer need freezing?
|
||||
open_layer.freeze(end_lsn).await;
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
@@ -138,11 +143,25 @@ impl LayerManager {
|
||||
self.layer_map.frozen_layers.push_back(open_layer_rc);
|
||||
self.layer_map.open_layer = None;
|
||||
self.layer_map.next_open_layer_at = Some(end_lsn);
|
||||
}
|
||||
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
// Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
|
||||
// accounts for regions in the LSN range where we might have ingested no data due to sharding.
|
||||
last_freeze_at.store(end_lsn);
|
||||
|
||||
// the writer state must no longer have a reference to the frozen layer
|
||||
let taken = write_lock.take();
|
||||
assert_eq!(
|
||||
froze,
|
||||
taken.is_some(),
|
||||
"should only had frozen a layer when TimelineWriterState existed"
|
||||
);
|
||||
|
||||
froze
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
@@ -207,6 +226,18 @@ impl LayerManager {
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Called when a GC-compaction is completed.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn finish_gc_compaction(
|
||||
&mut self,
|
||||
compact_from: &[Layer],
|
||||
compact_to: &[ResidentLayer],
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
// We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
|
||||
self.finish_compact_l0(compact_from, compact_to, metrics)
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
pub(crate) fn rewrite_layers(
|
||||
&mut self,
|
||||
@@ -308,6 +339,10 @@ impl LayerManager {
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
|
||||
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
|
||||
self.layer_fmgr.0.keys().cloned().collect_vec()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
|
||||
@@ -3,12 +3,10 @@ use super::storage_layer::ResidentLayer;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::Lineage;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
use utils::lsn::AtomicLsn;
|
||||
@@ -45,34 +43,25 @@ pub(crate) struct UploadQueueInitialized {
|
||||
/// Counter to assign task IDs
|
||||
pub(crate) task_counter: u64,
|
||||
|
||||
/// All layer files stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations
|
||||
pub(crate) latest_files: HashMap<LayerName, LayerFileMetadata>,
|
||||
/// The next uploaded index_part.json; assumed to be dirty.
|
||||
///
|
||||
/// Should not be read, directly except for layer file updates. Instead you should add a
|
||||
/// projected field.
|
||||
pub(crate) dirty: IndexPart,
|
||||
|
||||
/// The latest remote persisted IndexPart.
|
||||
///
|
||||
/// Each completed metadata upload will update this. The second item is the task_id which last
|
||||
/// updated the value, used to ensure we never store an older value over a newer one.
|
||||
pub(crate) clean: (IndexPart, Option<u64>),
|
||||
|
||||
/// How many file uploads or deletions been scheduled, since the
|
||||
/// last (scheduling of) metadata index upload?
|
||||
pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
|
||||
|
||||
/// Metadata stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations.
|
||||
/// DANGER: do not return to outside world, e.g., safekeepers.
|
||||
pub(crate) latest_metadata: TimelineMetadata,
|
||||
|
||||
/// Part of the flattened "next" `index_part.json`.
|
||||
pub(crate) latest_lineage: Lineage,
|
||||
|
||||
/// The last aux file policy used on this timeline.
|
||||
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
|
||||
/// `disk_consistent_lsn` from the last metadata file that was successfully
|
||||
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
|
||||
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
|
||||
/// Safekeeper can rely on it to make decisions for WAL storage.
|
||||
///
|
||||
/// visible_remote_consistent_lsn is only updated after our generation has been validated with
|
||||
/// The Lsn is only updated after our generation has been validated with
|
||||
/// the control plane (unlesss a timeline's generation is None, in which case
|
||||
/// we skip validation)
|
||||
pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
|
||||
pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
|
||||
|
||||
// Breakdown of different kinds of tasks currently in-progress
|
||||
@@ -118,7 +107,8 @@ impl UploadQueueInitialized {
|
||||
}
|
||||
|
||||
pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
|
||||
self.projected_remote_consistent_lsn
|
||||
let lsn = self.clean.0.metadata.disk_consistent_lsn();
|
||||
self.clean.1.map(|_| lsn)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,13 +164,12 @@ impl UploadQueue {
|
||||
|
||||
info!("initializing upload queue for empty remote");
|
||||
|
||||
let index_part = IndexPart::empty(metadata.clone());
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
|
||||
latest_files: HashMap::new(),
|
||||
dirty: index_part.clone(),
|
||||
clean: (index_part, None),
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: metadata.clone(),
|
||||
latest_lineage: Lineage::default(),
|
||||
projected_remote_consistent_lsn: None,
|
||||
visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
@@ -193,7 +182,6 @@ impl UploadQueue {
|
||||
dangling_files: HashMap::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -211,22 +199,15 @@ impl UploadQueue {
|
||||
}
|
||||
}
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
|
||||
for (layer_name, layer_metadata) in &index_part.layer_metadata {
|
||||
files.insert(layer_name.to_owned(), layer_metadata.clone());
|
||||
}
|
||||
|
||||
info!(
|
||||
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
|
||||
index_part.metadata.disk_consistent_lsn()
|
||||
);
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
latest_files: files,
|
||||
dirty: index_part.clone(),
|
||||
clean: (index_part.clone(), None),
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: index_part.metadata.clone(),
|
||||
latest_lineage: index_part.lineage.clone(),
|
||||
projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
|
||||
visible_remote_consistent_lsn: Arc::new(
|
||||
index_part.metadata.disk_consistent_lsn().into(),
|
||||
),
|
||||
@@ -241,7 +222,6 @@ impl UploadQueue {
|
||||
dangling_files: HashMap::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: index_part.last_aux_file_policy(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -298,13 +278,16 @@ pub(crate) enum UploadOp {
|
||||
/// Upload a layer file
|
||||
UploadLayer(ResidentLayer, LayerFileMetadata),
|
||||
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(Box<IndexPart>, Lsn),
|
||||
/// Upload a index_part.json file
|
||||
UploadMetadata {
|
||||
/// The next [`UploadQueueInitialized::clean`] after this upload succeeds.
|
||||
uploaded: Box<IndexPart>,
|
||||
},
|
||||
|
||||
/// Delete layer files
|
||||
Delete(Delete),
|
||||
|
||||
/// Barrier. When the barrier operation is reached,
|
||||
/// Barrier. When the barrier operation is reached, the channel is closed.
|
||||
Barrier(tokio::sync::watch::Sender<()>),
|
||||
|
||||
/// Shutdown; upon encountering this operation no new operations will be spawned, otherwise
|
||||
@@ -322,8 +305,12 @@ impl std::fmt::Display for UploadOp {
|
||||
layer, metadata.file_size, metadata.generation
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
write!(f, "UploadMetadata(lsn: {})", lsn)
|
||||
UploadOp::UploadMetadata { uploaded, .. } => {
|
||||
write!(
|
||||
f,
|
||||
"UploadMetadata(lsn: {})",
|
||||
uploaded.metadata.disk_consistent_lsn()
|
||||
)
|
||||
}
|
||||
UploadOp::Delete(delete) => {
|
||||
write!(f, "Delete({} layers)", delete.layers.len())
|
||||
|
||||
@@ -234,6 +234,7 @@ impl WalIngest {
|
||||
modification,
|
||||
&parsed_xact,
|
||||
info == pg_constants::XLOG_XACT_COMMIT,
|
||||
decoded.origin_id,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -246,6 +247,7 @@ impl WalIngest {
|
||||
modification,
|
||||
&parsed_xact,
|
||||
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
|
||||
decoded.origin_id,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -375,6 +377,18 @@ impl WalIngest {
|
||||
self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
|
||||
}
|
||||
}
|
||||
pg_constants::RM_REPLORIGIN_ID => {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_REPLORIGIN_SET {
|
||||
let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf);
|
||||
modification
|
||||
.set_replorigin(xlrec.node_id, xlrec.remote_lsn)
|
||||
.await?
|
||||
} else if info == pg_constants::XLOG_REPLORIGIN_DROP {
|
||||
let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf);
|
||||
modification.drop_replorigin(xlrec.node_id).await?
|
||||
}
|
||||
}
|
||||
_x => {
|
||||
// TODO: should probably log & fail here instead of blindly
|
||||
// doing something without understanding the protocol
|
||||
@@ -1178,6 +1192,7 @@ impl WalIngest {
|
||||
modification: &mut DatadirModification<'_>,
|
||||
parsed: &XlXactParsedRecord,
|
||||
is_commit: bool,
|
||||
origin_id: u16,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Record update of CLOG pages
|
||||
@@ -1243,6 +1258,11 @@ impl WalIngest {
|
||||
}
|
||||
}
|
||||
}
|
||||
if origin_id != 0 {
|
||||
modification
|
||||
.set_replorigin(origin_id, parsed.origin_lsn)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -9,10 +9,10 @@ use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{BlockNumber, TimestampTz};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
|
||||
use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
|
||||
use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::{bin_ser::DeserializeError, lsn::Lsn};
|
||||
|
||||
/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
|
||||
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
|
||||
@@ -116,6 +116,7 @@ pub struct DecodedWALRecord {
|
||||
|
||||
pub blocks: Vec<DecodedBkpBlock>,
|
||||
pub main_data_offset: usize,
|
||||
pub origin_id: u16,
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
@@ -573,6 +574,7 @@ pub struct XlXactParsedRecord {
|
||||
pub subxacts: Vec<TransactionId>,
|
||||
|
||||
pub xnodes: Vec<RelFileNode>,
|
||||
pub origin_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl XlXactParsedRecord {
|
||||
@@ -651,6 +653,11 @@ impl XlXactParsedRecord {
|
||||
debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid);
|
||||
}
|
||||
|
||||
let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 {
|
||||
Lsn(buf.get_u64_le())
|
||||
} else {
|
||||
Lsn::INVALID
|
||||
};
|
||||
XlXactParsedRecord {
|
||||
xid,
|
||||
info,
|
||||
@@ -660,6 +667,7 @@ impl XlXactParsedRecord {
|
||||
ts_id,
|
||||
subxacts,
|
||||
xnodes,
|
||||
origin_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -810,6 +818,36 @@ impl XlRunningXacts {
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlReploriginDrop {
|
||||
pub node_id: RepOriginId,
|
||||
}
|
||||
|
||||
impl XlReploriginDrop {
|
||||
pub fn decode(buf: &mut Bytes) -> XlReploriginDrop {
|
||||
XlReploriginDrop {
|
||||
node_id: buf.get_u16_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlReploriginSet {
|
||||
pub remote_lsn: Lsn,
|
||||
pub node_id: RepOriginId,
|
||||
}
|
||||
|
||||
impl XlReploriginSet {
|
||||
pub fn decode(buf: &mut Bytes) -> XlReploriginSet {
|
||||
XlReploriginSet {
|
||||
remote_lsn: Lsn(buf.get_u64_le()),
|
||||
node_id: buf.get_u16_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Main routine to decode a WAL record and figure out which blocks are modified
|
||||
//
|
||||
// See xlogrecord.h for details
|
||||
@@ -844,6 +882,7 @@ pub fn decode_wal_record(
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
let mut got_rnode = false;
|
||||
let mut origin_id: u16 = 0;
|
||||
|
||||
let mut buf = record.clone();
|
||||
|
||||
@@ -891,7 +930,7 @@ pub fn decode_wal_record(
|
||||
|
||||
pg_constants::XLR_BLOCK_ID_ORIGIN => {
|
||||
// RepOriginId is uint16
|
||||
buf.advance(2);
|
||||
origin_id = buf.get_u16_le();
|
||||
}
|
||||
|
||||
pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => {
|
||||
@@ -1088,6 +1127,7 @@ pub fn decode_wal_record(
|
||||
decoded.xl_info = xlogrec.xl_info;
|
||||
decoded.xl_rmid = xlogrec.xl_rmid;
|
||||
decoded.record = record;
|
||||
decoded.origin_id = origin_id;
|
||||
decoded.main_data_offset = main_data_offset;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
|
||||
/// Process lifecycle and abstracction for the IPC protocol.
|
||||
mod process;
|
||||
pub use process::Kind as ProcessKind;
|
||||
|
||||
/// Code to apply [`NeonWalRecord`]s.
|
||||
pub(crate) mod apply_neon;
|
||||
@@ -34,7 +33,6 @@ use crate::repository::Key;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::key_to_rel_block;
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::sync::Arc;
|
||||
@@ -55,7 +53,7 @@ pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
/// The current [`process::Process`] that is used by new redo requests.
|
||||
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
|
||||
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
|
||||
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
|
||||
/// their process object; we use [`Arc::clone`] for that.
|
||||
@@ -67,7 +65,7 @@ pub struct PostgresRedoManager {
|
||||
/// still be using the old redo process. But, those other tasks will most likely
|
||||
/// encounter an error as well, and errors are an unexpected condition anyway.
|
||||
/// So, probably we could get rid of the `Arc` in the future.
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -208,30 +206,35 @@ impl PostgresRedoManager {
|
||||
) -> anyhow::Result<Bytes> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
|
||||
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
|
||||
let proc: Arc<process::WalRedoProcess> =
|
||||
match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
@@ -362,10 +365,10 @@ impl PostgresRedoManager {
|
||||
&self,
|
||||
key: Key,
|
||||
page: &mut BytesMut,
|
||||
_record_lsn: Lsn,
|
||||
record_lsn: Lsn,
|
||||
record: &NeonWalRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
apply_neon::apply_in_neon(record, key, page)?;
|
||||
apply_neon::apply_in_neon(record, record_lsn, key, page)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
@@ -14,6 +14,7 @@ use postgres_ffi::v14::nonrelfile_utils::{
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use tracing::*;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Can this request be served by neon redo functions
|
||||
/// or we need to pass it to wal-redo postgres process?
|
||||
@@ -32,6 +33,7 @@ pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
|
||||
|
||||
pub(crate) fn apply_in_neon(
|
||||
record: &NeonWalRecord,
|
||||
lsn: Lsn,
|
||||
key: Key,
|
||||
page: &mut BytesMut,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
@@ -48,7 +50,7 @@ pub(crate) fn apply_in_neon(
|
||||
flags,
|
||||
} => {
|
||||
// sanity check that this is modifying the correct relation
|
||||
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
assert!(
|
||||
rel.forknum == VISIBILITYMAP_FORKNUM,
|
||||
"ClearVisibilityMapFlags record on unexpected rel {}",
|
||||
@@ -67,6 +69,7 @@ pub(crate) fn apply_in_neon(
|
||||
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
|
||||
|
||||
map[map_byte as usize] &= !(flags << map_offset);
|
||||
postgres_ffi::page_set_lsn(page, lsn);
|
||||
}
|
||||
|
||||
// Repeat for 'old_heap_blkno', if any
|
||||
@@ -80,12 +83,13 @@ pub(crate) fn apply_in_neon(
|
||||
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
|
||||
|
||||
map[map_byte as usize] &= !(flags << map_offset);
|
||||
postgres_ffi::page_set_lsn(page, lsn);
|
||||
}
|
||||
}
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
|
||||
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
|
||||
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
@@ -130,7 +134,7 @@ pub(crate) fn apply_in_neon(
|
||||
}
|
||||
}
|
||||
NeonWalRecord::ClogSetAborted { xids } => {
|
||||
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
|
||||
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
@@ -160,7 +164,7 @@ pub(crate) fn apply_in_neon(
|
||||
}
|
||||
}
|
||||
NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
|
||||
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
|
||||
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactOffsets,
|
||||
@@ -192,7 +196,7 @@ pub(crate) fn apply_in_neon(
|
||||
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
|
||||
}
|
||||
NeonWalRecord::MultixactMembersCreate { moff, members } => {
|
||||
let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
|
||||
let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactMembers,
|
||||
@@ -285,7 +289,7 @@ mod test {
|
||||
let mut page = BytesMut::from_iter(base_image);
|
||||
|
||||
for record in deltas {
|
||||
apply_in_neon(&record, file_path, &mut page)?;
|
||||
apply_in_neon(&record, Lsn(8), file_path, &mut page)?;
|
||||
}
|
||||
|
||||
let reconstructed = AuxFilesDirectory::des(&page)?;
|
||||
|
||||
@@ -1,64 +1,187 @@
|
||||
/// Layer of indirection previously used to support multiple implementations.
|
||||
/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use tracing::warn;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
|
||||
mod no_leak_child;
|
||||
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
|
||||
mod protocol;
|
||||
|
||||
mod process_impl {
|
||||
pub(super) mod process_async;
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
process::{Command, Stdio},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, poison::Poison};
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
#[cfg(feature = "testing")]
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
|
||||
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Clone,
|
||||
Copy,
|
||||
Debug,
|
||||
PartialEq,
|
||||
Eq,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
strum_macros::IntoStaticStr,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[repr(u8)]
|
||||
pub enum Kind {
|
||||
Sync,
|
||||
Async,
|
||||
struct ProcessInput {
|
||||
stdin: tokio::process::ChildStdin,
|
||||
n_requests: usize,
|
||||
}
|
||||
|
||||
pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
|
||||
struct ProcessOutput {
|
||||
stdout: tokio::process::ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
}
|
||||
|
||||
impl Process {
|
||||
#[inline(always)]
|
||||
pub fn launch(
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
if conf.walredo_process_kind != Kind::Async {
|
||||
warn!(
|
||||
configured = %conf.walredo_process_kind,
|
||||
"the walredo_process_kind setting has been turned into a no-op, using async implementation"
|
||||
);
|
||||
}
|
||||
Ok(Self(process_impl::process_async::WalRedoProcess::launch(
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
let stdin =
|
||||
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
|
||||
let stdout = tokio::process::ChildStdout::from_std(stdout)
|
||||
.context("convert to tokio::ChildStdout")?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
#[cfg(feature = "testing")]
|
||||
tenant_shard_id,
|
||||
pg_version,
|
||||
)?))
|
||||
child: Some(child),
|
||||
stdin: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdin",
|
||||
ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
},
|
||||
)),
|
||||
stdout: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdout",
|
||||
ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
/// Apply given WAL records ('records') over an old page image. Returns
|
||||
/// new page image.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation safe.
|
||||
#[instrument(skip_all, fields(pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
@@ -67,12 +190,193 @@ impl Process {
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
self.0
|
||||
.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let Ok(res) =
|
||||
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
|
||||
else {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
};
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.0.id()
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// When not polled to completion (e.g. because in `tokio::select!` another
|
||||
/// branch becomes ready before this future), concurrent and subsequent
|
||||
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
|
||||
/// Dispose of this process instance and create a new one.
|
||||
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
|
||||
let request_no = {
|
||||
let mut lock_guard = self.stdin.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let input = poison_guard.data_mut();
|
||||
input
|
||||
.stdin
|
||||
.write_all(writebuf)
|
||||
.await
|
||||
.context("write to walredo stdin")?;
|
||||
let request_no = input.n_requests;
|
||||
input.n_requests += 1;
|
||||
poison_guard.disarm();
|
||||
request_no
|
||||
};
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut lock_guard = self.stdout.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let output = poison_guard.data_mut();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
output
|
||||
.stdout
|
||||
.read_exact(&mut resultbuf)
|
||||
.await
|
||||
.context("read walredo stdout")?;
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
poison_guard.disarm();
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
use std::io::Write;
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,374 +0,0 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::process::{no_leak_child, protocol},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
process::{Command, Stdio},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, poison::Poison};
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
|
||||
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: tokio::process::ChildStdin,
|
||||
n_requests: usize,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: tokio::process::ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
let stdin =
|
||||
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
|
||||
let stdout = tokio::process::ChildStdout::from_std(stdout)
|
||||
.context("convert to tokio::ChildStdout")?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdin",
|
||||
ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
},
|
||||
)),
|
||||
stdout: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdout",
|
||||
ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
/// Apply given WAL records ('records') over an old page image. Returns
|
||||
/// new page image.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation safe.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
base_img: &Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let Ok(res) =
|
||||
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
|
||||
else {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
};
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// When not polled to completion (e.g. because in `tokio::select!` another
|
||||
/// branch becomes ready before this future), concurrent and subsequent
|
||||
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
|
||||
/// Dispose of this process instance and create a new one.
|
||||
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
|
||||
let request_no = {
|
||||
let mut lock_guard = self.stdin.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let input = poison_guard.data_mut();
|
||||
input
|
||||
.stdin
|
||||
.write_all(writebuf)
|
||||
.await
|
||||
.context("write to walredo stdin")?;
|
||||
let request_no = input.n_requests;
|
||||
input.n_requests += 1;
|
||||
poison_guard.disarm();
|
||||
request_no
|
||||
};
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut lock_guard = self.stdout.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let output = poison_guard.data_mut();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
output
|
||||
.stdout
|
||||
.read_exact(&mut resultbuf)
|
||||
.await
|
||||
.context("read walredo stdout")?;
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
poison_guard.disarm();
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
use std::io::Write;
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
223
patches/pg_anon.patch
Normal file
223
patches/pg_anon.patch
Normal file
@@ -0,0 +1,223 @@
|
||||
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
|
||||
Author: Alexey Masterov <alexeymasterov@neon.tech>
|
||||
Date: Fri May 31 06:34:26 2024 +0000
|
||||
|
||||
These alternative expected files were added to consider the neon features
|
||||
|
||||
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
|
||||
new file mode 100644
|
||||
index 0000000..2539cfd
|
||||
--- /dev/null
|
||||
+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
|
||||
@@ -0,0 +1,101 @@
|
||||
+BEGIN;
|
||||
+CREATE EXTENSION anon CASCADE;
|
||||
+NOTICE: installing required extension "pgcrypto"
|
||||
+SELECT anon.init();
|
||||
+ init
|
||||
+------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+CREATE ROLE mallory_the_masked_user;
|
||||
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
|
||||
+CREATE TABLE t1(i INT);
|
||||
+ALTER TABLE t1 ADD COLUMN t TEXT;
|
||||
+SECURITY LABEL FOR anon ON COLUMN t1.t
|
||||
+IS 'MASKED WITH VALUE NULL';
|
||||
+INSERT INTO t1 VALUES (1,'test');
|
||||
+--
|
||||
+-- We're checking the owner's permissions
|
||||
+--
|
||||
+-- see
|
||||
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
|
||||
+--
|
||||
+SET ROLE mallory_the_masked_user;
|
||||
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
|
||||
+ ?column?
|
||||
+----------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+-- SHOULD FAIL
|
||||
+DO $$
|
||||
+BEGIN
|
||||
+ PERFORM anon.init();
|
||||
+ EXCEPTION WHEN insufficient_privilege
|
||||
+ THEN RAISE NOTICE 'insufficient_privilege';
|
||||
+END$$;
|
||||
+NOTICE: insufficient_privilege
|
||||
+-- SHOULD FAIL
|
||||
+DO $$
|
||||
+BEGIN
|
||||
+ PERFORM anon.anonymize_table('t1');
|
||||
+ EXCEPTION WHEN insufficient_privilege
|
||||
+ THEN RAISE NOTICE 'insufficient_privilege';
|
||||
+END$$;
|
||||
+NOTICE: insufficient_privilege
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_start_engine;
|
||||
+SELECT anon.start_dynamic_masking();
|
||||
+ERROR: Only supersusers can start the dynamic masking engine.
|
||||
+CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
|
||||
+ROLLBACK TO fail_start_engine;
|
||||
+RESET ROLE;
|
||||
+SELECT anon.start_dynamic_masking();
|
||||
+ start_dynamic_masking
|
||||
+-----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SET ROLE mallory_the_masked_user;
|
||||
+SELECT * FROM mask.t1;
|
||||
+ i | t
|
||||
+---+---
|
||||
+ 1 |
|
||||
+(1 row)
|
||||
+
|
||||
+-- SHOULD FAIL
|
||||
+DO $$
|
||||
+BEGIN
|
||||
+ SELECT * FROM public.t1;
|
||||
+ EXCEPTION WHEN insufficient_privilege
|
||||
+ THEN RAISE NOTICE 'insufficient_privilege';
|
||||
+END$$;
|
||||
+NOTICE: insufficient_privilege
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_stop_engine;
|
||||
+SELECT anon.stop_dynamic_masking();
|
||||
+ERROR: Only supersusers can stop the dynamic masking engine.
|
||||
+CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
|
||||
+ROLLBACK TO fail_stop_engine;
|
||||
+RESET ROLE;
|
||||
+SELECT anon.stop_dynamic_masking();
|
||||
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
|
||||
+ stop_dynamic_masking
|
||||
+----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SET ROLE mallory_the_masked_user;
|
||||
+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
|
||||
+ ?column?
|
||||
+----------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_seclabel_on_role;
|
||||
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
|
||||
+ERROR: permission denied
|
||||
+DETAIL: The current user must have the CREATEROLE attribute.
|
||||
+ROLLBACK TO fail_seclabel_on_role;
|
||||
+ROLLBACK;
|
||||
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
|
||||
new file mode 100644
|
||||
index 0000000..8b090fe
|
||||
--- /dev/null
|
||||
+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
|
||||
@@ -0,0 +1,104 @@
|
||||
+BEGIN;
|
||||
+CREATE EXTENSION anon CASCADE;
|
||||
+NOTICE: installing required extension "pgcrypto"
|
||||
+SELECT anon.init();
|
||||
+ init
|
||||
+------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+CREATE ROLE oscar_the_owner;
|
||||
+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
|
||||
+CREATE ROLE mallory_the_masked_user;
|
||||
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
|
||||
+--
|
||||
+-- We're checking the owner's permissions
|
||||
+--
|
||||
+-- see
|
||||
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
|
||||
+--
|
||||
+SET ROLE oscar_the_owner;
|
||||
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
|
||||
+ ?column?
|
||||
+----------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+-- SHOULD FAIL
|
||||
+DO $$
|
||||
+BEGIN
|
||||
+ PERFORM anon.init();
|
||||
+ EXCEPTION WHEN insufficient_privilege
|
||||
+ THEN RAISE NOTICE 'insufficient_privilege';
|
||||
+END$$;
|
||||
+NOTICE: insufficient_privilege
|
||||
+CREATE TABLE t1(i INT);
|
||||
+ALTER TABLE t1 ADD COLUMN t TEXT;
|
||||
+SECURITY LABEL FOR anon ON COLUMN t1.t
|
||||
+IS 'MASKED WITH VALUE NULL';
|
||||
+INSERT INTO t1 VALUES (1,'test');
|
||||
+SELECT anon.anonymize_table('t1');
|
||||
+ anonymize_table
|
||||
+-----------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SELECT * FROM t1;
|
||||
+ i | t
|
||||
+---+---
|
||||
+ 1 |
|
||||
+(1 row)
|
||||
+
|
||||
+UPDATE t1 SET t='test' WHERE i=1;
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_start_engine;
|
||||
+SELECT anon.start_dynamic_masking();
|
||||
+ start_dynamic_masking
|
||||
+-----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+ROLLBACK TO fail_start_engine;
|
||||
+RESET ROLE;
|
||||
+SELECT anon.start_dynamic_masking();
|
||||
+ start_dynamic_masking
|
||||
+-----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SET ROLE oscar_the_owner;
|
||||
+SELECT * FROM t1;
|
||||
+ i | t
|
||||
+---+------
|
||||
+ 1 | test
|
||||
+(1 row)
|
||||
+
|
||||
+--SELECT * FROM mask.t1;
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_stop_engine;
|
||||
+SELECT anon.stop_dynamic_masking();
|
||||
+ERROR: permission denied for schema mask
|
||||
+CONTEXT: SQL statement "DROP VIEW mask.t1;"
|
||||
+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
|
||||
+SQL statement "SELECT anon.mask_drop_view(oid)
|
||||
+ FROM pg_catalog.pg_class
|
||||
+ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
|
||||
+ AND relkind IN ('r','p','f')"
|
||||
+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
|
||||
+ROLLBACK TO fail_stop_engine;
|
||||
+RESET ROLE;
|
||||
+SELECT anon.stop_dynamic_masking();
|
||||
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
|
||||
+ stop_dynamic_masking
|
||||
+----------------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
+SET ROLE oscar_the_owner;
|
||||
+-- SHOULD FAIL
|
||||
+SAVEPOINT fail_seclabel_on_role;
|
||||
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
|
||||
+ERROR: permission denied
|
||||
+DETAIL: The current user must have the CREATEROLE attribute.
|
||||
+ROLLBACK TO fail_seclabel_on_role;
|
||||
+ROLLBACK;
|
||||
19
patches/pg_cron.patch
Normal file
19
patches/pg_cron.patch
Normal file
@@ -0,0 +1,19 @@
|
||||
commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master)
|
||||
Author: Alexey Masterov <alexeymasterov@neon.tech>
|
||||
Date: Fri Jun 7 19:23:42 2024 +0000
|
||||
|
||||
Disable REGRESS_OPTIONS causing initdb
|
||||
|
||||
diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile
|
||||
index 053314c..fbd5fb5 100644
|
||||
--- a/ext-src/pg_cron-src/Makefile
|
||||
+++ b/ext-src/pg_cron-src/Makefile
|
||||
@@ -5,7 +5,7 @@ EXTENSION = pg_cron
|
||||
DATA_built = $(EXTENSION)--1.0.sql
|
||||
DATA = $(wildcard $(EXTENSION)--*--*.sql)
|
||||
|
||||
-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
|
||||
+#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
|
||||
REGRESS = pg_cron-test
|
||||
|
||||
# compilation configuration
|
||||
39
patches/pg_hintplan.patch
Normal file
39
patches/pg_hintplan.patch
Normal file
@@ -0,0 +1,39 @@
|
||||
commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
|
||||
Author: Alexey Masterov <alexeymasterov@neon.tech>
|
||||
Date: Thu Jun 6 08:02:42 2024 +0000
|
||||
|
||||
Patch expected files to consider Neon's log messages
|
||||
|
||||
diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
index da723b8..f8d0102 100644
|
||||
--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
@@ -9,13 +9,16 @@ SET search_path TO public;
|
||||
----
|
||||
-- No.A-1-1-3
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
-- No.A-1-2-3
|
||||
DROP EXTENSION pg_hint_plan;
|
||||
-- No.A-1-1-4
|
||||
CREATE SCHEMA other_schema;
|
||||
CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan"
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
DROP SCHEMA other_schema;
|
||||
----
|
||||
---- No. A-5-1 comment pattern
|
||||
diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
index d372459..6282afe 100644
|
||||
--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
|
||||
SET client_min_messages TO LOG;
|
||||
SET pg_hint_plan.enable_hint TO on;
|
||||
CREATE EXTENSION file_fdw;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
|
||||
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
|
||||
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
|
||||
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
|
||||
@@ -605,7 +605,7 @@ prefetch_read(PrefetchRequest *slot)
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_shard_log(slot->shard_no, WARNING,
|
||||
neon_shard_log(slot->shard_no, LOG,
|
||||
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
|
||||
(long)slot->my_ring_index,
|
||||
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
|
||||
|
||||
@@ -452,7 +452,7 @@ pub struct ApiLocks<K> {
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ApiLockError {
|
||||
#[error("permit could not be acquired")]
|
||||
#[error("timeout acquiring resource permit")]
|
||||
TimeoutError(#[from] tokio::time::error::Elapsed),
|
||||
}
|
||||
|
||||
@@ -504,7 +504,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
.clone()
|
||||
}
|
||||
};
|
||||
let permit = semaphore.acquire_deadline(now + self.timeout).await;
|
||||
let permit = semaphore.acquire_timeout(self.timeout).await;
|
||||
|
||||
self.metrics
|
||||
.semaphore_acquire_seconds
|
||||
|
||||
@@ -3,7 +3,7 @@ use parking_lot::Mutex;
|
||||
use std::{pin::pin, sync::Arc, time::Duration};
|
||||
use tokio::{
|
||||
sync::Notify,
|
||||
time::{error::Elapsed, timeout_at, Instant},
|
||||
time::{error::Elapsed, Instant},
|
||||
};
|
||||
|
||||
use self::aimd::Aimd;
|
||||
@@ -80,7 +80,7 @@ pub struct LimiterInner {
|
||||
}
|
||||
|
||||
impl LimiterInner {
|
||||
fn update(&mut self, latency: Duration, outcome: Option<Outcome>) {
|
||||
fn update_limit(&mut self, latency: Duration, outcome: Option<Outcome>) {
|
||||
if let Some(outcome) = outcome {
|
||||
let sample = Sample {
|
||||
latency,
|
||||
@@ -92,12 +92,12 @@ impl LimiterInner {
|
||||
}
|
||||
|
||||
fn take(&mut self, ready: &Notify) -> Option<()> {
|
||||
if self.available > 1 {
|
||||
if self.available >= 1 {
|
||||
self.available -= 1;
|
||||
self.in_flight += 1;
|
||||
|
||||
// tell the next in the queue that there is a permit ready
|
||||
if self.available > 1 {
|
||||
if self.available >= 1 {
|
||||
ready.notify_one();
|
||||
}
|
||||
Some(())
|
||||
@@ -157,16 +157,12 @@ impl DynamicLimiter {
|
||||
}
|
||||
|
||||
/// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
|
||||
///
|
||||
/// Returns `None` if there are none available after `duration`.
|
||||
pub async fn acquire_timeout(self: &Arc<Self>, duration: Duration) -> Result<Token, Elapsed> {
|
||||
self.acquire_deadline(Instant::now() + duration).await
|
||||
tokio::time::timeout(duration, self.acquire()).await?
|
||||
}
|
||||
|
||||
/// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available.
|
||||
///
|
||||
/// Returns `None` if there are none available after `deadline`.
|
||||
pub async fn acquire_deadline(self: &Arc<Self>, deadline: Instant) -> Result<Token, Elapsed> {
|
||||
/// Try to acquire a concurrency [Token].
|
||||
async fn acquire(self: &Arc<Self>) -> Result<Token, Elapsed> {
|
||||
if self.config.initial_limit == 0 {
|
||||
// If the rate limiter is disabled, we can always acquire a token.
|
||||
Ok(Token::disabled())
|
||||
@@ -174,22 +170,16 @@ impl DynamicLimiter {
|
||||
let mut notified = pin!(self.ready.notified());
|
||||
let mut ready = notified.as_mut().enable();
|
||||
loop {
|
||||
let mut limit = None;
|
||||
if ready {
|
||||
let mut inner = self.inner.lock();
|
||||
if inner.take(&self.ready).is_some() {
|
||||
break Ok(Token::new(self.clone()));
|
||||
}
|
||||
limit = Some(inner.limit);
|
||||
}
|
||||
match timeout_at(deadline, notified.as_mut()).await {
|
||||
Ok(()) => ready = true,
|
||||
Err(e) => {
|
||||
let limit = limit.unwrap_or_else(|| self.inner.lock().limit);
|
||||
tracing::info!(limit, "could not acquire token in time");
|
||||
break Err(e);
|
||||
} else {
|
||||
notified.set(self.ready.notified());
|
||||
}
|
||||
}
|
||||
notified.as_mut().await;
|
||||
ready = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -208,14 +198,14 @@ impl DynamicLimiter {
|
||||
|
||||
let mut inner = self.inner.lock();
|
||||
|
||||
inner.update(start.elapsed(), outcome);
|
||||
inner.update_limit(start.elapsed(), outcome);
|
||||
|
||||
inner.in_flight -= 1;
|
||||
if inner.in_flight < inner.limit {
|
||||
inner.available = inner.limit - inner.in_flight;
|
||||
// At least 1 permit is now available
|
||||
self.ready.notify_one();
|
||||
}
|
||||
|
||||
inner.in_flight -= 1;
|
||||
}
|
||||
|
||||
/// The current state of the limiter.
|
||||
|
||||
@@ -51,7 +51,9 @@ impl LimitAlgorithm for Aimd {
|
||||
// E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
|
||||
let limit = limit.floor() as usize;
|
||||
|
||||
limit.clamp(self.min, self.max)
|
||||
let limit = limit.clamp(self.min, self.max);
|
||||
tracing::info!(limit, "limit decreased");
|
||||
limit
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -67,6 +69,53 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn increase_decrease() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 1,
|
||||
algorithm: RateLimitAlgorithm::Aimd {
|
||||
conf: Aimd {
|
||||
min: 1,
|
||||
max: 2,
|
||||
inc: 10,
|
||||
dec: 0.5,
|
||||
utilisation: 0.8,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
let limiter = DynamicLimiter::new(config);
|
||||
|
||||
let token = limiter
|
||||
.acquire_timeout(Duration::from_millis(1))
|
||||
.await
|
||||
.unwrap();
|
||||
token.release(Outcome::Success);
|
||||
|
||||
assert_eq!(limiter.state().limit(), 2);
|
||||
|
||||
let token = limiter
|
||||
.acquire_timeout(Duration::from_millis(1))
|
||||
.await
|
||||
.unwrap();
|
||||
token.release(Outcome::Success);
|
||||
assert_eq!(limiter.state().limit(), 2);
|
||||
|
||||
let token = limiter
|
||||
.acquire_timeout(Duration::from_millis(1))
|
||||
.await
|
||||
.unwrap();
|
||||
token.release(Outcome::Overload);
|
||||
assert_eq!(limiter.state().limit(), 1);
|
||||
|
||||
let token = limiter
|
||||
.acquire_timeout(Duration::from_millis(1))
|
||||
.await
|
||||
.unwrap();
|
||||
token.release(Outcome::Overload);
|
||||
assert_eq!(limiter.state().limit(), 1);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn should_decrease_limit_on_overload() {
|
||||
let config = RateLimiterConfig {
|
||||
@@ -85,7 +134,7 @@ mod tests {
|
||||
let limiter = DynamicLimiter::new(config);
|
||||
|
||||
let token = limiter
|
||||
.acquire_timeout(Duration::from_millis(1))
|
||||
.acquire_timeout(Duration::from_millis(100))
|
||||
.await
|
||||
.unwrap();
|
||||
token.release(Outcome::Overload);
|
||||
@@ -93,6 +142,41 @@ mod tests {
|
||||
assert_eq!(limiter.state().limit(), 5, "overload: decrease");
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn acquire_timeout_times_out() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 1,
|
||||
algorithm: RateLimitAlgorithm::Aimd {
|
||||
conf: Aimd {
|
||||
min: 1,
|
||||
max: 2,
|
||||
inc: 10,
|
||||
dec: 0.5,
|
||||
utilisation: 0.8,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
let limiter = DynamicLimiter::new(config);
|
||||
|
||||
let token = limiter
|
||||
.acquire_timeout(Duration::from_millis(1))
|
||||
.await
|
||||
.unwrap();
|
||||
let now = tokio::time::Instant::now();
|
||||
limiter
|
||||
.acquire_timeout(Duration::from_secs(1))
|
||||
.await
|
||||
.err()
|
||||
.unwrap();
|
||||
|
||||
assert!(now.elapsed() >= Duration::from_secs(1));
|
||||
|
||||
token.release(Outcome::Success);
|
||||
|
||||
assert_eq!(limiter.state().limit(), 2);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn should_increase_limit_on_success_when_using_gt_util_threshold() {
|
||||
let config = RateLimiterConfig {
|
||||
|
||||
@@ -45,6 +45,10 @@ pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
|
||||
err.to_string(),
|
||||
StatusCode::REQUEST_TIMEOUT,
|
||||
),
|
||||
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
|
||||
this.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -11,6 +11,7 @@ either.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
anyhow.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
thiserror.workspace = true
|
||||
rand.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::{types::ObjectIdentifier, Client};
|
||||
use aws_sdk_s3::Client;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use tracing::{error, info, warn};
|
||||
@@ -70,7 +70,7 @@ pub(crate) fn branch_cleanup_and_check_errors(
|
||||
|
||||
match s3_data {
|
||||
Some(s3_data) => {
|
||||
result.garbage_keys.extend(s3_data.keys_to_remove);
|
||||
result.garbage_keys.extend(s3_data.unknown_keys);
|
||||
|
||||
match s3_data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
@@ -78,27 +78,26 @@ pub(crate) fn branch_cleanup_and_check_errors(
|
||||
index_part_generation: _index_part_generation,
|
||||
s3_layers: _s3_layers,
|
||||
} => {
|
||||
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
|
||||
result.errors.push(format!(
|
||||
"index_part.json version: {}",
|
||||
index_part.get_version()
|
||||
))
|
||||
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) {
|
||||
result
|
||||
.errors
|
||||
.push(format!("index_part.json version: {}", index_part.version()))
|
||||
}
|
||||
|
||||
if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
|
||||
if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
|
||||
result.warnings.push(format!(
|
||||
"index_part.json version is not latest: {}",
|
||||
index_part.get_version()
|
||||
index_part.version()
|
||||
))
|
||||
}
|
||||
|
||||
if index_part.metadata.disk_consistent_lsn()
|
||||
!= index_part.get_disk_consistent_lsn()
|
||||
!= index_part.duplicated_disk_consistent_lsn()
|
||||
{
|
||||
result.errors.push(format!(
|
||||
"Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
|
||||
index_part.metadata.disk_consistent_lsn(),
|
||||
index_part.get_disk_consistent_lsn(),
|
||||
index_part.duplicated_disk_consistent_lsn(),
|
||||
))
|
||||
}
|
||||
|
||||
@@ -240,7 +239,12 @@ impl TenantObjectListing {
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct S3TimelineBlobData {
|
||||
pub(crate) blob_data: BlobDataParseResult,
|
||||
pub(crate) keys_to_remove: Vec<String>,
|
||||
|
||||
// Index objects that were not used when loading `blob_data`, e.g. those from old generations
|
||||
pub(crate) unused_index_keys: Vec<String>,
|
||||
|
||||
// Objects whose keys were not recognized at all, i.e. not layer files, not indices
|
||||
pub(crate) unknown_keys: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -276,12 +280,12 @@ pub(crate) async fn list_timeline_blobs(
|
||||
let mut s3_layers = HashSet::new();
|
||||
|
||||
let mut errors = Vec::new();
|
||||
let mut keys_to_remove = Vec::new();
|
||||
let mut unknown_keys = Vec::new();
|
||||
|
||||
let mut timeline_dir_target = s3_root.timeline_root(&id);
|
||||
timeline_dir_target.delimiter = String::new();
|
||||
|
||||
let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
|
||||
let mut index_part_keys: Vec<String> = Vec::new();
|
||||
let mut initdb_archive: bool = false;
|
||||
|
||||
let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
|
||||
@@ -292,16 +296,16 @@ pub(crate) async fn list_timeline_blobs(
|
||||
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
|
||||
match blob_name {
|
||||
Some(name) if name.starts_with("index_part.json") => {
|
||||
tracing::info!("Index key {key}");
|
||||
index_parts.push(obj)
|
||||
tracing::debug!("Index key {key}");
|
||||
index_part_keys.push(key.to_owned())
|
||||
}
|
||||
Some("initdb.tar.zst") => {
|
||||
tracing::info!("initdb archive {key}");
|
||||
tracing::debug!("initdb archive {key}");
|
||||
initdb_archive = true;
|
||||
}
|
||||
Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
|
||||
Ok((new_layer, gen)) => {
|
||||
tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
|
||||
tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
|
||||
s3_layers.insert((new_layer, gen));
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -309,37 +313,37 @@ pub(crate) async fn list_timeline_blobs(
|
||||
errors.push(
|
||||
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
|
||||
);
|
||||
keys_to_remove.push(key.to_string());
|
||||
unknown_keys.push(key.to_string());
|
||||
}
|
||||
},
|
||||
None => {
|
||||
tracing::info!("Peculiar key {}", key);
|
||||
tracing::warn!("Unknown key {}", key);
|
||||
errors.push(format!("S3 list response got an object with odd key {key}"));
|
||||
keys_to_remove.push(key.to_string());
|
||||
unknown_keys.push(key.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive {
|
||||
tracing::info!(
|
||||
if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive {
|
||||
tracing::debug!(
|
||||
"Timeline is empty apart from initdb archive: expected post-deletion state."
|
||||
);
|
||||
return Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Relic,
|
||||
keys_to_remove: Vec::new(),
|
||||
unused_index_keys: index_part_keys,
|
||||
unknown_keys: Vec::new(),
|
||||
});
|
||||
}
|
||||
|
||||
// Choose the index_part with the highest generation
|
||||
let (index_part_object, index_part_generation) = match index_parts
|
||||
let (index_part_object, index_part_generation) = match index_part_keys
|
||||
.iter()
|
||||
.filter_map(|k| {
|
||||
let key = k.key();
|
||||
.filter_map(|key| {
|
||||
// Stripping the index key to the last part, because RemotePath doesn't
|
||||
// like absolute paths, and depending on prefix_in_bucket it's possible
|
||||
// for the keys we read back to start with a slash.
|
||||
let basename = key.rsplit_once('/').unwrap().1;
|
||||
parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g))
|
||||
parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
|
||||
})
|
||||
.max_by_key(|i| i.1)
|
||||
.map(|(k, g)| (k.clone(), g))
|
||||
@@ -347,15 +351,18 @@ pub(crate) async fn list_timeline_blobs(
|
||||
Some((key, gen)) => (Some(key), gen),
|
||||
None => {
|
||||
// Legacy/missing case: one or zero index parts, which did not have a generation
|
||||
(index_parts.pop(), Generation::none())
|
||||
(index_part_keys.pop(), Generation::none())
|
||||
}
|
||||
};
|
||||
|
||||
if index_part_object.is_none() {
|
||||
errors.push("S3 list response got no index_part.json file".to_string());
|
||||
match index_part_object.as_ref() {
|
||||
Some(selected) => index_part_keys.retain(|k| k != selected),
|
||||
None => {
|
||||
errors.push("S3 list response got no index_part.json file".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) {
|
||||
if let Some(index_part_object_key) = index_part_object.as_ref() {
|
||||
let index_part_bytes = download_object_with_retries(
|
||||
s3_client,
|
||||
&timeline_dir_target.bucket_name,
|
||||
@@ -372,17 +379,14 @@ pub(crate) async fn list_timeline_blobs(
|
||||
index_part_generation,
|
||||
s3_layers,
|
||||
},
|
||||
keys_to_remove,
|
||||
unused_index_keys: index_part_keys,
|
||||
unknown_keys,
|
||||
})
|
||||
}
|
||||
Err(index_parse_error) => errors.push(format!(
|
||||
"index_part.json body parsing error: {index_parse_error}"
|
||||
)),
|
||||
}
|
||||
} else {
|
||||
errors.push(format!(
|
||||
"Index part object {index_part_object:?} has no key"
|
||||
));
|
||||
}
|
||||
|
||||
if errors.is_empty() {
|
||||
@@ -393,6 +397,7 @@ pub(crate) async fn list_timeline_blobs(
|
||||
|
||||
Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Incorrect(errors),
|
||||
keys_to_remove,
|
||||
unused_index_keys: index_part_keys,
|
||||
unknown_keys,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ pub mod checks;
|
||||
pub mod cloud_admin_api;
|
||||
pub mod garbage;
|
||||
pub mod metadata_stream;
|
||||
pub mod pageserver_physical_gc;
|
||||
pub mod scan_pageserver_metadata;
|
||||
pub mod scan_safekeeper_metadata;
|
||||
pub mod tenant_snapshot;
|
||||
@@ -396,7 +397,7 @@ async fn download_object_with_retries(
|
||||
.await
|
||||
{
|
||||
Ok(bytes_read) => {
|
||||
tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}");
|
||||
tracing::debug!("Downloaded {bytes_read} bytes for object {key}");
|
||||
return Ok(body_buf);
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -2,11 +2,13 @@ use anyhow::bail;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
|
||||
use s3_scrubber::pageserver_physical_gc::GcMode;
|
||||
use s3_scrubber::scan_pageserver_metadata::scan_metadata;
|
||||
use s3_scrubber::tenant_snapshot::SnapshotDownloader;
|
||||
use s3_scrubber::{
|
||||
init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
|
||||
NodeKind, TraversingDepth,
|
||||
init_logging, pageserver_physical_gc::pageserver_physical_gc,
|
||||
scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
|
||||
TraversingDepth,
|
||||
};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
@@ -62,6 +64,14 @@ enum Command {
|
||||
#[arg(short, long)]
|
||||
output_path: Utf8PathBuf,
|
||||
},
|
||||
PageserverPhysicalGc {
|
||||
#[arg(long = "tenant-id", num_args = 0..)]
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
#[arg(long = "min-age")]
|
||||
min_age: humantime::Duration,
|
||||
#[arg(short, long, default_value_t = GcMode::IndicesOnly)]
|
||||
mode: GcMode,
|
||||
},
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -75,6 +85,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
Command::FindGarbage { .. } => "find-garbage",
|
||||
Command::PurgeGarbage { .. } => "purge-garbage",
|
||||
Command::TenantSnapshot { .. } => "tenant-snapshot",
|
||||
Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc",
|
||||
};
|
||||
let _guard = init_logging(&format!(
|
||||
"{}_{}_{}_{}.log",
|
||||
@@ -178,5 +189,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?;
|
||||
downloader.download().await
|
||||
}
|
||||
Command::PageserverPhysicalGc {
|
||||
tenant_ids,
|
||||
min_age,
|
||||
mode,
|
||||
} => {
|
||||
let summary =
|
||||
pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?;
|
||||
println!("{}", serde_json::to_string(&summary).unwrap());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
239
s3_scrubber/src/pageserver_physical_gc.rs
Normal file
239
s3_scrubber/src/pageserver_physical_gc.rs
Normal file
@@ -0,0 +1,239 @@
|
||||
use std::time::{Duration, UNIX_EPOCH};
|
||||
|
||||
use crate::checks::{list_timeline_blobs, BlobDataParseResult};
|
||||
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
|
||||
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
|
||||
use aws_sdk_s3::Client;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::Serialize;
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::generation::Generation;
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct GcSummary {
|
||||
indices_deleted: usize,
|
||||
remote_storage_errors: usize,
|
||||
}
|
||||
|
||||
#[derive(clap::ValueEnum, Debug, Clone, Copy)]
|
||||
pub enum GcMode {
|
||||
// Delete nothing
|
||||
DryRun,
|
||||
|
||||
// Enable only removing old-generation indices
|
||||
IndicesOnly,
|
||||
// Enable all forms of GC
|
||||
// TODO: this will be used when shard split ancestor layer deletion is added
|
||||
// All,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for GcMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
GcMode::DryRun => write!(f, "dry-run"),
|
||||
GcMode::IndicesOnly => write!(f, "indices-only"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn maybe_delete_index(
|
||||
s3_client: &Client,
|
||||
bucket_config: &BucketConfig,
|
||||
min_age: &Duration,
|
||||
latest_gen: Generation,
|
||||
key: &str,
|
||||
mode: GcMode,
|
||||
summary: &mut GcSummary,
|
||||
) {
|
||||
// Validation: we will only delete things that parse cleanly
|
||||
let basename = key.rsplit_once('/').unwrap().1;
|
||||
let candidate_generation =
|
||||
match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) {
|
||||
Some(g) => g,
|
||||
None => {
|
||||
if basename == IndexPart::FILE_NAME {
|
||||
// A legacy pre-generation index
|
||||
Generation::none()
|
||||
} else {
|
||||
// A strange key: we will not delete this because we don't understand it.
|
||||
tracing::warn!("Bad index key");
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Validation: we will only delete indices more than one generation old, to avoid interfering
|
||||
// in typical migrations, even if they are very long running.
|
||||
if candidate_generation >= latest_gen {
|
||||
// This shouldn't happen: when we loaded metadata, it should have selected the latest
|
||||
// generation already, and only populated [`S3TimelineBlobData::unused_index_keys`]
|
||||
// with older generations.
|
||||
tracing::warn!("Deletion candidate is >= latest generation, this is a bug!");
|
||||
return;
|
||||
} else if candidate_generation.next() == latest_gen {
|
||||
// Skip deleting the latest-1th generation's index.
|
||||
return;
|
||||
}
|
||||
|
||||
// Validation: we will only delete indices after one week, so that during incidents we will have
|
||||
// easy access to recent indices.
|
||||
let age: Duration = match s3_client
|
||||
.head_object()
|
||||
.bucket(&bucket_config.bucket)
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => match response.last_modified {
|
||||
None => {
|
||||
tracing::warn!("Missing last_modified");
|
||||
summary.remote_storage_errors += 1;
|
||||
return;
|
||||
}
|
||||
Some(last_modified) => {
|
||||
let last_modified =
|
||||
UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64());
|
||||
match last_modified.elapsed() {
|
||||
Ok(e) => e,
|
||||
Err(_) => {
|
||||
tracing::warn!("Bad last_modified time: {last_modified:?}");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to HEAD {key}: {e}");
|
||||
summary.remote_storage_errors += 1;
|
||||
return;
|
||||
}
|
||||
};
|
||||
if &age < min_age {
|
||||
tracing::info!(
|
||||
"Skipping young object {} < {}",
|
||||
age.as_secs_f64(),
|
||||
min_age.as_secs_f64()
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
if matches!(mode, GcMode::DryRun) {
|
||||
tracing::info!("Dry run: would delete this key");
|
||||
return;
|
||||
}
|
||||
|
||||
// All validations passed: erase the object
|
||||
match s3_client
|
||||
.delete_object()
|
||||
.bucket(&bucket_config.bucket)
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
tracing::info!("Successfully deleted index");
|
||||
summary.indices_deleted += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to delete index: {e}");
|
||||
summary.remote_storage_errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection
|
||||
/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection
|
||||
/// is about removing:
|
||||
/// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
|
||||
/// uploading a layer and uploading an index)
|
||||
/// - Index objects from historic generations
|
||||
///
|
||||
/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
|
||||
/// make sure that object listings don't get slowed down by large numbers of garbage objects.
|
||||
pub async fn pageserver_physical_gc(
|
||||
bucket_config: BucketConfig,
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
min_age: Duration,
|
||||
mode: GcMode,
|
||||
) -> anyhow::Result<GcSummary> {
|
||||
let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?;
|
||||
|
||||
let tenants = if tenant_ids.is_empty() {
|
||||
futures::future::Either::Left(stream_tenants(&s3_client, &target))
|
||||
} else {
|
||||
futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
|
||||
};
|
||||
|
||||
// How many tenants to process in parallel. We need to be mindful of pageservers
|
||||
// accessing the same per tenant prefixes, so use a lower setting than pageservers.
|
||||
const CONCURRENCY: usize = 32;
|
||||
|
||||
// Generate a stream of TenantTimelineId
|
||||
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
|
||||
let timelines = timelines.try_buffered(CONCURRENCY);
|
||||
let timelines = timelines.try_flatten();
|
||||
|
||||
// Generate a stream of S3TimelineBlobData
|
||||
async fn gc_timeline(
|
||||
s3_client: &Client,
|
||||
bucket_config: &BucketConfig,
|
||||
min_age: &Duration,
|
||||
target: &RootTarget,
|
||||
mode: GcMode,
|
||||
ttid: TenantShardTimelineId,
|
||||
) -> anyhow::Result<GcSummary> {
|
||||
let mut summary = GcSummary::default();
|
||||
let data = list_timeline_blobs(s3_client, ttid, target).await?;
|
||||
|
||||
let (latest_gen, candidates) = match &data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
index_part: _index_part,
|
||||
index_part_generation,
|
||||
s3_layers: _s3_layers,
|
||||
} => (*index_part_generation, data.unused_index_keys),
|
||||
BlobDataParseResult::Relic => {
|
||||
// Post-deletion tenant location: don't try and GC it.
|
||||
return Ok(summary);
|
||||
}
|
||||
BlobDataParseResult::Incorrect(reasons) => {
|
||||
// Our primary purpose isn't to report on bad data, but log this rather than skipping silently
|
||||
tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
|
||||
return Ok(summary);
|
||||
}
|
||||
};
|
||||
|
||||
for key in candidates {
|
||||
maybe_delete_index(
|
||||
s3_client,
|
||||
bucket_config,
|
||||
min_age,
|
||||
latest_gen,
|
||||
&key,
|
||||
mode,
|
||||
&mut summary,
|
||||
)
|
||||
.instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key))
|
||||
.await;
|
||||
}
|
||||
|
||||
Ok(summary)
|
||||
}
|
||||
let timelines = timelines
|
||||
.map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid));
|
||||
let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
|
||||
|
||||
let mut summary = GcSummary::default();
|
||||
|
||||
while let Some(i) = timelines.next().await {
|
||||
let tl_summary = i?;
|
||||
|
||||
summary.indices_deleted += tl_summary.indices_deleted;
|
||||
summary.remote_storage_errors += tl_summary.remote_storage_errors;
|
||||
}
|
||||
|
||||
Ok(summary)
|
||||
}
|
||||
@@ -125,7 +125,7 @@ impl MetadataSummary {
|
||||
{
|
||||
*self
|
||||
.indices_by_version
|
||||
.entry(index_part.get_version())
|
||||
.entry(index_part.version())
|
||||
.or_insert(0) += 1;
|
||||
|
||||
if let Err(e) = self.update_histograms(index_part) {
|
||||
|
||||
@@ -213,6 +213,9 @@ pub async fn main_task(
|
||||
}
|
||||
};
|
||||
|
||||
// remove timeline from the broker active set sooner, before waiting for background tasks
|
||||
tli_broker_active.set(false);
|
||||
|
||||
// shutdown background tasks
|
||||
if conf.is_wal_backup_enabled() {
|
||||
wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await;
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
|
||||
use rand::Rng;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -276,13 +275,6 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
|
||||
debug!("started");
|
||||
let await_duration = conf.partial_backup_timeout;
|
||||
|
||||
// sleep for random time to avoid thundering herd
|
||||
{
|
||||
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
|
||||
let sleep_duration = await_duration.mul_f64(randf64);
|
||||
tokio::time::sleep(sleep_duration).await;
|
||||
}
|
||||
|
||||
let (_, persistent_state) = tli.get_state().await;
|
||||
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
|
||||
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
|
||||
|
||||
@@ -40,6 +40,7 @@ tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
measured.workspace = true
|
||||
scopeguard.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
|
||||
57
storage_controller/src/background_node_operations.rs
Normal file
57
storage_controller/src/background_node_operations.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use std::{borrow::Cow, fmt::Debug, fmt::Display};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::NodeId;
|
||||
|
||||
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) struct Drain {
|
||||
pub(crate) node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) struct Fill {
|
||||
pub(crate) node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) enum Operation {
|
||||
Drain(Drain),
|
||||
Fill(Fill),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum OperationError {
|
||||
#[error("Node state changed during operation: {0}")]
|
||||
NodeStateChanged(Cow<'static, str>),
|
||||
#[error("Operation cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
pub(crate) struct OperationHandler {
|
||||
pub(crate) operation: Operation,
|
||||
#[allow(unused)]
|
||||
pub(crate) cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl Display for Drain {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "drain {}", self.node_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Fill {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "fill {}", self.node_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Operation {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
Operation::Drain(op) => write!(f, "{op}"),
|
||||
Operation::Fill(op) => write!(f, "{op}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -142,52 +142,6 @@ async fn handle_tenant_create(
|
||||
)
|
||||
}
|
||||
|
||||
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
|
||||
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids
|
||||
// needing to track a "deleting" state for tenants.
|
||||
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
|
||||
where
|
||||
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
|
||||
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
let started_at = Instant::now();
|
||||
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
|
||||
// completed.
|
||||
let mut retry_period = Duration::from_secs(1);
|
||||
// On subsequent retries, wait longer.
|
||||
let max_retry_period = Duration::from_secs(5);
|
||||
// Enable callers with a 30 second request timeout to reliably get a response
|
||||
let max_wait = Duration::from_secs(25);
|
||||
|
||||
loop {
|
||||
let status = f(service.clone()).await?;
|
||||
match status {
|
||||
StatusCode::ACCEPTED => {
|
||||
tracing::info!("Deletion accepted, waiting to try again...");
|
||||
tokio::time::sleep(retry_period).await;
|
||||
retry_period = max_retry_period;
|
||||
}
|
||||
StatusCode::NOT_FOUND => {
|
||||
tracing::info!("Deletion complete");
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!("Unexpected status {status}");
|
||||
return json_response(status, ());
|
||||
}
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
if now + retry_period > started_at + max_wait {
|
||||
tracing::info!("Deletion timed out waiting for 404");
|
||||
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
|
||||
// the pageserver's swagger definition for this endpoint, and has the same desired
|
||||
// effect of causing the control plane to retry later.
|
||||
return json_response(StatusCode::CONFLICT, ());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_tenant_location_config(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
@@ -283,13 +237,17 @@ async fn handle_tenant_delete(
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service
|
||||
.tenant_delete(tenant_id)
|
||||
.await
|
||||
.and_then(map_reqwest_hyper_status)
|
||||
})
|
||||
.await
|
||||
let status_code = service
|
||||
.tenant_delete(tenant_id)
|
||||
.await
|
||||
.and_then(map_reqwest_hyper_status)?;
|
||||
|
||||
if status_code == StatusCode::NOT_FOUND {
|
||||
// The pageserver uses 404 for successful deletion, but we use 200
|
||||
json_response(StatusCode::OK, ())
|
||||
} else {
|
||||
json_response(status_code, ())
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_create(
|
||||
@@ -317,6 +275,51 @@ async fn handle_tenant_timeline_delete(
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
// For timeline deletions, which both implement an "initially return 202, then 404 once
|
||||
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
|
||||
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
|
||||
where
|
||||
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
|
||||
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
let started_at = Instant::now();
|
||||
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
|
||||
// completed.
|
||||
let mut retry_period = Duration::from_secs(1);
|
||||
// On subsequent retries, wait longer.
|
||||
let max_retry_period = Duration::from_secs(5);
|
||||
// Enable callers with a 30 second request timeout to reliably get a response
|
||||
let max_wait = Duration::from_secs(25);
|
||||
|
||||
loop {
|
||||
let status = f(service.clone()).await?;
|
||||
match status {
|
||||
StatusCode::ACCEPTED => {
|
||||
tracing::info!("Deletion accepted, waiting to try again...");
|
||||
tokio::time::sleep(retry_period).await;
|
||||
retry_period = max_retry_period;
|
||||
}
|
||||
StatusCode::NOT_FOUND => {
|
||||
tracing::info!("Deletion complete");
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!("Unexpected status {status}");
|
||||
return json_response(status, ());
|
||||
}
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
if now + retry_period > started_at + max_wait {
|
||||
tracing::info!("Deletion timed out waiting for 404");
|
||||
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
|
||||
// the pageserver's swagger definition for this endpoint, and has the same desired
|
||||
// effect of causing the control plane to retry later.
|
||||
return json_response(StatusCode::CONFLICT, ());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service
|
||||
.tenant_timeline_delete(tenant_id, timeline_id)
|
||||
@@ -477,6 +480,39 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
|
||||
let node_status = state.service.get_node(node_id).await?;
|
||||
|
||||
json_response(StatusCode::OK, node_status)
|
||||
}
|
||||
|
||||
async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
|
||||
state.service.start_node_drain(node_id).await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
|
||||
state.service.start_node_fill(node_id).await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_split(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
@@ -829,6 +865,16 @@ pub fn make_router(
|
||||
RequestName("control_v1_node_config"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/node/:node_id", |r| {
|
||||
named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
|
||||
})
|
||||
.put("/control/v1/node/:node_id/drain", |r| {
|
||||
named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
|
||||
})
|
||||
.put("/control/v1/node/:node_id/fill", |r| {
|
||||
named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill"))
|
||||
})
|
||||
// TODO(vlad): endpoint for cancelling drain and fill
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(
|
||||
|
||||
@@ -2,6 +2,7 @@ use serde::Serialize;
|
||||
use utils::seqwait::MonotonicCounter;
|
||||
|
||||
mod auth;
|
||||
mod background_node_operations;
|
||||
mod compute_hook;
|
||||
mod heartbeater;
|
||||
pub mod http;
|
||||
|
||||
@@ -59,6 +59,10 @@ impl Node {
|
||||
self.id
|
||||
}
|
||||
|
||||
pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
|
||||
self.scheduling
|
||||
}
|
||||
|
||||
pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) {
|
||||
self.scheduling = scheduling
|
||||
}
|
||||
@@ -141,6 +145,7 @@ impl Node {
|
||||
NodeSchedulingPolicy::Draining => MaySchedule::No,
|
||||
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
|
||||
NodeSchedulingPolicy::Pause => MaySchedule::No,
|
||||
NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,7 +162,7 @@ impl Node {
|
||||
listen_http_port,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
scheduling: NodeSchedulingPolicy::Filling,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
availability: NodeAvailability::Offline,
|
||||
cancel: CancellationToken::new(),
|
||||
}
|
||||
|
||||
@@ -442,13 +442,15 @@ impl Persistence {
|
||||
#[tracing::instrument(skip_all, fields(node_id))]
|
||||
pub(crate) async fn re_attach(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
input_node_id: NodeId,
|
||||
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
||||
use crate::schema::nodes::dsl::scheduling_policy;
|
||||
use crate::schema::nodes::dsl::*;
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
|
||||
let rows_updated = diesel::update(tenant_shards)
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.filter(generation_pageserver.eq(input_node_id.0 as i64))
|
||||
.set(generation.eq(generation + 1))
|
||||
.execute(conn)?;
|
||||
|
||||
@@ -457,9 +459,23 @@ impl Persistence {
|
||||
// TODO: UPDATE+SELECT in one query
|
||||
|
||||
let updated = tenant_shards
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.filter(generation_pageserver.eq(input_node_id.0 as i64))
|
||||
.select(TenantShardPersistence::as_select())
|
||||
.load(conn)?;
|
||||
|
||||
// If the node went through a drain and restart phase before re-attaching,
|
||||
// then reset it's node scheduling policy to active.
|
||||
diesel::update(nodes)
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.filter(
|
||||
scheduling_policy
|
||||
.eq(String::from(NodeSchedulingPolicy::PauseForRestart))
|
||||
.or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining)))
|
||||
.or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))),
|
||||
)
|
||||
.set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::{node::Node, tenant_shard::TenantShard};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::controller_api::UtilizationScore;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
@@ -29,6 +30,8 @@ pub enum MaySchedule {
|
||||
struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
shard_count: usize,
|
||||
/// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
attached_shard_count: usize,
|
||||
|
||||
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
||||
/// from a node's availability state and scheduling policy).
|
||||
@@ -42,7 +45,9 @@ impl PartialEq for SchedulerNode {
|
||||
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
|
||||
);
|
||||
|
||||
may_schedule_matches && self.shard_count == other.shard_count
|
||||
may_schedule_matches
|
||||
&& self.shard_count == other.shard_count
|
||||
&& self.attached_shard_count == other.attached_shard_count
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,6 +143,15 @@ impl ScheduleContext {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum RefCountUpdate {
|
||||
PromoteSecondary,
|
||||
Attach,
|
||||
Detach,
|
||||
DemoteAttached,
|
||||
AddSecondary,
|
||||
RemoveSecondary,
|
||||
}
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
@@ -146,6 +160,7 @@ impl Scheduler {
|
||||
node.get_id(),
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
@@ -171,6 +186,7 @@ impl Scheduler {
|
||||
node.get_id(),
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
@@ -179,7 +195,10 @@ impl Scheduler {
|
||||
for shard in shards {
|
||||
if let Some(node_id) = shard.intent.get_attached() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => node.shard_count += 1,
|
||||
Some(node) => {
|
||||
node.shard_count += 1;
|
||||
node.attached_shard_count += 1;
|
||||
}
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
@@ -227,31 +246,80 @@ impl Scheduler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Increment the reference count of a node. This reference count is used to guide scheduling
|
||||
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
|
||||
/// this node.
|
||||
/// Update the reference counts of a node. These reference counts are used to guide scheduling
|
||||
/// decisions, not for memory management: they represent the number of tenant shard whose IntentState
|
||||
/// targets this node and the number of tenants shars whose IntentState is attached to this
|
||||
/// node.
|
||||
///
|
||||
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
|
||||
/// [`Self::new`] or [`Self::node_upsert`])
|
||||
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
|
||||
pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
debug_assert!(false);
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
return;
|
||||
};
|
||||
|
||||
node.shard_count += 1;
|
||||
match update {
|
||||
RefCountUpdate::PromoteSecondary => {
|
||||
node.attached_shard_count += 1;
|
||||
}
|
||||
RefCountUpdate::Attach => {
|
||||
node.shard_count += 1;
|
||||
node.attached_shard_count += 1;
|
||||
}
|
||||
RefCountUpdate::Detach => {
|
||||
node.shard_count -= 1;
|
||||
node.attached_shard_count -= 1;
|
||||
}
|
||||
RefCountUpdate::DemoteAttached => {
|
||||
node.attached_shard_count -= 1;
|
||||
}
|
||||
RefCountUpdate::AddSecondary => {
|
||||
node.shard_count += 1;
|
||||
}
|
||||
RefCountUpdate::RemoveSecondary => {
|
||||
node.shard_count -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
|
||||
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
// Check if the number of shards attached to a given node is lagging below
|
||||
// the cluster average. If that's the case, the node should be filled.
|
||||
pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize {
|
||||
let Some(node) = self.nodes.get(&node_id) else {
|
||||
debug_assert!(false);
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
return;
|
||||
return 0;
|
||||
};
|
||||
assert!(!self.nodes.is_empty());
|
||||
let expected_attached_shards_per_node = self.expected_attached_shard_count();
|
||||
|
||||
node.shard_count -= 1;
|
||||
for (node_id, node) in self.nodes.iter() {
|
||||
tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
|
||||
}
|
||||
|
||||
if node.attached_shard_count < expected_attached_shards_per_node {
|
||||
expected_attached_shards_per_node - node.attached_shard_count
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn expected_attached_shard_count(&self) -> usize {
|
||||
let total_attached_shards: usize =
|
||||
self.nodes.values().map(|n| n.attached_shard_count).sum();
|
||||
|
||||
assert!(!self.nodes.is_empty());
|
||||
total_attached_shards / self.nodes.len()
|
||||
}
|
||||
|
||||
pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> {
|
||||
self.nodes
|
||||
.iter()
|
||||
.map(|(node_id, stats)| (*node_id, stats.attached_shard_count))
|
||||
.sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse())
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub(crate) fn node_upsert(&mut self, node: &Node) {
|
||||
@@ -263,6 +331,7 @@ impl Scheduler {
|
||||
Vacant(entry) => {
|
||||
entry.insert(SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
});
|
||||
}
|
||||
@@ -385,6 +454,11 @@ impl Scheduler {
|
||||
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
|
||||
self.nodes.get(&node_id).unwrap().shard_count
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
|
||||
self.nodes.get(&node_id).unwrap().attached_shard_count
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -437,18 +511,28 @@ mod tests {
|
||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
|
||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
|
||||
|
||||
t1_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
|
||||
|
||||
let total_attached = scheduler.get_node_attached_shard_count(NodeId(1))
|
||||
+ scheduler.get_node_attached_shard_count(NodeId(2));
|
||||
assert_eq!(total_attached, 1);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Dropping an IntentState without clearing it causes a panic in debug mode,
|
||||
@@ -459,8 +543,12 @@ mod tests {
|
||||
assert!(result.is_err());
|
||||
} else {
|
||||
t2_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -8,13 +8,17 @@ use std::{
|
||||
};
|
||||
|
||||
use crate::{
|
||||
background_node_operations::{
|
||||
Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
|
||||
},
|
||||
compute_hook::NotifyError,
|
||||
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
|
||||
persistence::{AbortShardSplitStatus, TenantFilter},
|
||||
reconciler::{ReconcileError, ReconcileUnits},
|
||||
scheduler::{ScheduleContext, ScheduleMode},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
|
||||
tenant_shard::{
|
||||
MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction,
|
||||
MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
|
||||
ScheduleOptimizationAction,
|
||||
},
|
||||
};
|
||||
use anyhow::Context;
|
||||
@@ -134,6 +138,8 @@ struct ServiceState {
|
||||
|
||||
scheduler: Scheduler,
|
||||
|
||||
ongoing_operation: Option<OperationHandler>,
|
||||
|
||||
/// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
|
||||
delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
|
||||
}
|
||||
@@ -185,6 +191,7 @@ impl ServiceState {
|
||||
tenants,
|
||||
nodes: Arc::new(nodes),
|
||||
scheduler,
|
||||
ongoing_operation: None,
|
||||
delayed_reconcile_rx,
|
||||
}
|
||||
}
|
||||
@@ -296,6 +303,17 @@ impl From<ReconcileWaitError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OperationError> for ApiError {
|
||||
fn from(value: OperationError) -> Self {
|
||||
match value {
|
||||
OperationError::NodeStateChanged(err) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(err))
|
||||
}
|
||||
OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant)]
|
||||
enum TenantCreateOrUpdate {
|
||||
Create(TenantCreateRequest),
|
||||
@@ -1562,15 +1580,32 @@ impl Service {
|
||||
// Setting a node active unblocks any Reconcilers that might write to the location config API,
|
||||
// but those requests will not be accepted by the node until it has finished processing
|
||||
// the re-attach response.
|
||||
//
|
||||
// Additionally, reset the nodes scheduling policy to match the conditional update done
|
||||
// in [`Persistence::re_attach`].
|
||||
if let Some(node) = nodes.get(&reattach_req.node_id) {
|
||||
if !node.is_available() {
|
||||
let reset_scheduling = matches!(
|
||||
node.get_scheduling(),
|
||||
NodeSchedulingPolicy::PauseForRestart
|
||||
| NodeSchedulingPolicy::Draining
|
||||
| NodeSchedulingPolicy::Filling
|
||||
);
|
||||
|
||||
if !node.is_available() || reset_scheduling {
|
||||
let mut new_nodes = (**nodes).clone();
|
||||
if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
if !node.is_available() {
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
}
|
||||
|
||||
if reset_scheduling {
|
||||
node.set_scheduling(NodeSchedulingPolicy::Active);
|
||||
}
|
||||
|
||||
scheduler.node_upsert(node);
|
||||
let new_nodes = Arc::new(new_nodes);
|
||||
*nodes = new_nodes;
|
||||
}
|
||||
let new_nodes = Arc::new(new_nodes);
|
||||
*nodes = new_nodes;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1851,6 +1886,25 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Same as [`Service::await_waiters`], but returns the waiters which are still
|
||||
/// in progress
|
||||
async fn await_waiters_remainder(
|
||||
&self,
|
||||
waiters: Vec<ReconcilerWaiter>,
|
||||
timeout: Duration,
|
||||
) -> Vec<ReconcilerWaiter> {
|
||||
let deadline = Instant::now().checked_add(timeout).unwrap();
|
||||
for waiter in waiters.iter() {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
let _ = waiter.wait_timeout(timeout).await;
|
||||
}
|
||||
|
||||
waiters
|
||||
.into_iter()
|
||||
.filter(|waiter| matches!(waiter.get_status(), ReconcilerStatus::InProgress))
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
|
||||
/// and transform it into either a tenant creation of a series of shard updates.
|
||||
///
|
||||
@@ -2376,61 +2430,80 @@ impl Service {
|
||||
let _tenant_lock =
|
||||
trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await;
|
||||
|
||||
self.ensure_attached_wait(tenant_id).await?;
|
||||
|
||||
// TODO: refactor into helper
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
|
||||
// Detach all shards
|
||||
let (detach_waiters, shard_ids, node) = {
|
||||
let mut shard_ids = Vec::new();
|
||||
let mut detach_waiters = Vec::new();
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
for (tenant_shard_id, shard) in
|
||||
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
tenants.range_mut(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
let node_id = shard.intent.get_attached().ok_or_else(|| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
|
||||
})?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
shard_ids.push(*tenant_shard_id);
|
||||
|
||||
targets.push((*tenant_shard_id, node.clone()));
|
||||
// Update the tenant's intent to remove all attachments
|
||||
shard.policy = PlacementPolicy::Detached;
|
||||
shard
|
||||
.schedule(scheduler, &mut ScheduleContext::default())
|
||||
.expect("De-scheduling is infallible");
|
||||
debug_assert!(shard.intent.get_attached().is_none());
|
||||
debug_assert!(shard.intent.get_secondary().is_empty());
|
||||
|
||||
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
|
||||
detach_waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
targets
|
||||
|
||||
// Pick an arbitrary node to use for remote deletions (does not have to be where the tenant
|
||||
// was attached, just has to be able to see the S3 content)
|
||||
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
|
||||
let node = nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while lock is active");
|
||||
(detach_waiters, shard_ids, node.clone())
|
||||
};
|
||||
|
||||
// Phase 1: delete on the pageservers
|
||||
let mut any_pending = false;
|
||||
for (tenant_shard_id, node) in targets {
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.config.jwt_token.as_deref(),
|
||||
);
|
||||
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
|
||||
// surface immediately as an error to our caller.
|
||||
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Error deleting shard {tenant_shard_id} on node {node}: {e}",
|
||||
))
|
||||
})?;
|
||||
tracing::info!(
|
||||
"Shard {tenant_shard_id} on node {node}, delete returned {}",
|
||||
status
|
||||
);
|
||||
if status == StatusCode::ACCEPTED {
|
||||
any_pending = true;
|
||||
}
|
||||
if let Err(e) = self.await_waiters(detach_waiters, RECONCILE_TIMEOUT).await {
|
||||
// Failing to detach shouldn't hold up deletion, e.g. if a node is offline we should be able
|
||||
// to use some other node to run the remote deletion.
|
||||
tracing::warn!("Failed to detach some locations: {e}");
|
||||
}
|
||||
|
||||
if any_pending {
|
||||
// Caller should call us again later. When we eventually see 404s from
|
||||
// all the shards, we may proceed to delete our records of the tenant.
|
||||
tracing::info!(
|
||||
"Tenant {} has some shards pending deletion, returning 202",
|
||||
tenant_id
|
||||
);
|
||||
return Ok(StatusCode::ACCEPTED);
|
||||
let locations = shard_ids
|
||||
.into_iter()
|
||||
.map(|s| (s, node.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
let results = self.tenant_for_shards_api(
|
||||
locations,
|
||||
|tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await },
|
||||
1,
|
||||
3,
|
||||
RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
for result in results {
|
||||
match result {
|
||||
Ok(StatusCode::ACCEPTED) => {
|
||||
// This could happen if we failed detach above, and hit a pageserver where the tenant
|
||||
// is still attached: it will accept the deletion in the background
|
||||
tracing::warn!(
|
||||
"Unexpectedly still attached on {}, client should retry",
|
||||
node
|
||||
);
|
||||
return Ok(StatusCode::ACCEPTED);
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(mgmt_api::Error::Cancelled) => {
|
||||
return Err(ApiError::ShuttingDown);
|
||||
}
|
||||
Err(e) => {
|
||||
// This is unexpected: remote deletion should be infallible, unless the object store
|
||||
// at large is unavailable.
|
||||
tracing::error!("Error deleting via node {}: {e}", node);
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
|
||||
@@ -4109,6 +4182,18 @@ impl Service {
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_node(&self, node_id: NodeId) -> Result<Node, ApiError> {
|
||||
self.inner
|
||||
.read()
|
||||
.unwrap()
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.cloned()
|
||||
.ok_or(ApiError::NotFound(
|
||||
format!("Node {node_id} not registered").into(),
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) async fn node_register(
|
||||
&self,
|
||||
register_req: NodeRegisterRequest,
|
||||
@@ -4263,9 +4348,6 @@ impl Service {
|
||||
|
||||
if let Some(scheduling) = scheduling {
|
||||
node.set_scheduling(scheduling);
|
||||
|
||||
// TODO: once we have a background scheduling ticker for fill/drain, kick it
|
||||
// to wake up and start working.
|
||||
}
|
||||
|
||||
// Update the scheduler, in case the elegibility of the node for new shards has changed
|
||||
@@ -4293,7 +4375,7 @@ impl Service {
|
||||
continue;
|
||||
}
|
||||
|
||||
if tenant_shard.intent.demote_attached(node_id) {
|
||||
if tenant_shard.intent.demote_attached(scheduler, node_id) {
|
||||
tenant_shard.sequence = tenant_shard.sequence.next();
|
||||
|
||||
// TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
|
||||
@@ -4340,7 +4422,7 @@ impl Service {
|
||||
// TODO: in the background, we should balance work back onto this pageserver
|
||||
}
|
||||
AvailabilityTransition::Unchanged => {
|
||||
tracing::debug!("Node {} no change during config", node_id);
|
||||
tracing::debug!("Node {} no availability change during config", node_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4349,6 +4431,176 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn start_node_drain(
|
||||
self: &Arc<Self>,
|
||||
node_id: NodeId,
|
||||
) -> Result<(), ApiError> {
|
||||
let (ongoing_op, node_available, node_policy, schedulable_nodes_count) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let nodes = &locked.nodes;
|
||||
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
|
||||
anyhow::anyhow!("Node {} not registered", node_id).into(),
|
||||
))?;
|
||||
let schedulable_nodes_count = nodes
|
||||
.iter()
|
||||
.filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_)))
|
||||
.count();
|
||||
|
||||
(
|
||||
locked
|
||||
.ongoing_operation
|
||||
.as_ref()
|
||||
.map(|ongoing| ongoing.operation),
|
||||
node.is_available(),
|
||||
node.get_scheduling(),
|
||||
schedulable_nodes_count,
|
||||
)
|
||||
};
|
||||
|
||||
if let Some(ongoing) = ongoing_op {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Background operation already ongoing for node: {}", ongoing).into(),
|
||||
));
|
||||
}
|
||||
|
||||
if !node_available {
|
||||
return Err(ApiError::ResourceUnavailable(
|
||||
format!("Node {node_id} is currently unavailable").into(),
|
||||
));
|
||||
}
|
||||
|
||||
if schedulable_nodes_count == 0 {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
"No other schedulable nodes to drain to".into(),
|
||||
));
|
||||
}
|
||||
|
||||
match node_policy {
|
||||
NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
|
||||
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
|
||||
operation: Operation::Drain(Drain { node_id }),
|
||||
cancel: cancel.clone(),
|
||||
});
|
||||
|
||||
tokio::task::spawn({
|
||||
let service = self.clone();
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
let prev = service.inner.write().unwrap().ongoing_operation.take();
|
||||
|
||||
if let Some(Operation::Drain(removed_drain)) = prev.map(|h| h.operation) {
|
||||
assert_eq!(removed_drain.node_id, node_id, "We always take the same operation");
|
||||
} else {
|
||||
panic!("We always remove the same operation")
|
||||
}
|
||||
}
|
||||
service.drain_node(node_id, cancel).await
|
||||
}
|
||||
});
|
||||
}
|
||||
NodeSchedulingPolicy::Draining => {
|
||||
return Err(ApiError::Conflict(format!(
|
||||
"Node {node_id} has drain in progress"
|
||||
)));
|
||||
}
|
||||
policy => {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Node {node_id} cannot be drained due to {policy:?} policy").into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn start_node_fill(self: &Arc<Self>, node_id: NodeId) -> Result<(), ApiError> {
|
||||
let (ongoing_op, node_available, node_policy, total_nodes_count) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let nodes = &locked.nodes;
|
||||
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
|
||||
anyhow::anyhow!("Node {} not registered", node_id).into(),
|
||||
))?;
|
||||
|
||||
(
|
||||
locked
|
||||
.ongoing_operation
|
||||
.as_ref()
|
||||
.map(|ongoing| ongoing.operation),
|
||||
node.is_available(),
|
||||
node.get_scheduling(),
|
||||
nodes.len(),
|
||||
)
|
||||
};
|
||||
|
||||
if let Some(ongoing) = ongoing_op {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Background operation already ongoing for node: {}", ongoing).into(),
|
||||
));
|
||||
}
|
||||
|
||||
if !node_available {
|
||||
return Err(ApiError::ResourceUnavailable(
|
||||
format!("Node {node_id} is currently unavailable").into(),
|
||||
));
|
||||
}
|
||||
|
||||
if total_nodes_count <= 1 {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
"No other nodes to fill from".into(),
|
||||
));
|
||||
}
|
||||
|
||||
match node_policy {
|
||||
NodeSchedulingPolicy::Active => {
|
||||
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling))
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
|
||||
operation: Operation::Fill(Fill { node_id }),
|
||||
cancel: cancel.clone(),
|
||||
});
|
||||
|
||||
tokio::task::spawn({
|
||||
let service = self.clone();
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
let prev = service.inner.write().unwrap().ongoing_operation.take();
|
||||
|
||||
if let Some(Operation::Fill(removed_fill)) = prev.map(|h| h.operation) {
|
||||
assert_eq!(removed_fill.node_id, node_id, "We always take the same operation");
|
||||
} else {
|
||||
panic!("We always remove the same operation")
|
||||
}
|
||||
}
|
||||
|
||||
service.fill_node(node_id, cancel).await
|
||||
}
|
||||
});
|
||||
}
|
||||
NodeSchedulingPolicy::Filling => {
|
||||
return Err(ApiError::Conflict(format!(
|
||||
"Node {node_id} has fill in progress"
|
||||
)));
|
||||
}
|
||||
policy => {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Node {node_id} cannot be filled due to {policy:?} policy").into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper for methods that will try and call pageserver APIs for
|
||||
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
|
||||
/// is attached somewhere.
|
||||
@@ -4933,4 +5185,268 @@ impl Service {
|
||||
// to complete.
|
||||
self.gate.close().await;
|
||||
}
|
||||
|
||||
/// Drain a node by moving the shards attached to it as primaries.
|
||||
/// This is a long running operation and it should run as a separate Tokio task.
|
||||
pub(crate) async fn drain_node(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), OperationError> {
|
||||
tracing::info!(%node_id, "Starting drain background operation");
|
||||
|
||||
let mut last_inspected_shard: Option<TenantShardId> = None;
|
||||
let mut inspected_all_shards = false;
|
||||
let mut waiters = Vec::new();
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
|
||||
while !inspected_all_shards {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(OperationError::Cancelled);
|
||||
}
|
||||
|
||||
{
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
|
||||
format!("node {node_id} was removed").into(),
|
||||
))?;
|
||||
|
||||
let current_policy = node.get_scheduling();
|
||||
if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
|
||||
// TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
|
||||
// about it
|
||||
return Err(OperationError::NodeStateChanged(
|
||||
format!("node {node_id} changed state to {current_policy:?}").into(),
|
||||
));
|
||||
}
|
||||
|
||||
let mut cursor = tenants.iter_mut().skip_while({
|
||||
let skip_past = last_inspected_shard;
|
||||
move |(tid, _)| match skip_past {
|
||||
Some(last) => **tid != last,
|
||||
None => false,
|
||||
}
|
||||
});
|
||||
|
||||
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
|
||||
let (tid, tenant_shard) = match cursor.next() {
|
||||
Some(some) => some,
|
||||
None => {
|
||||
inspected_all_shards = true;
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
if tenant_shard.intent.demote_attached(scheduler, node_id) {
|
||||
match tenant_shard.schedule(scheduler, &mut schedule_context) {
|
||||
Err(e) => {
|
||||
tracing::warn!(%tid, "Scheduling error when draining pageserver {} : {e}", node_id);
|
||||
}
|
||||
Ok(()) => {
|
||||
let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
|
||||
if let Some(some) = waiter {
|
||||
waiters.push(some);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
last_inspected_shard = Some(*tid);
|
||||
}
|
||||
}
|
||||
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
|
||||
.await;
|
||||
}
|
||||
|
||||
while !waiters.is_empty() {
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
|
||||
.await;
|
||||
}
|
||||
|
||||
// At this point we have done the best we could to drain shards from this node.
|
||||
// Set the node scheduling policy to `[NodeSchedulingPolicy::PauseForRestart]`
|
||||
// to complete the drain.
|
||||
if let Err(err) = self
|
||||
.node_configure(node_id, None, Some(NodeSchedulingPolicy::PauseForRestart))
|
||||
.await
|
||||
{
|
||||
// This is not fatal. Anything that is polling the node scheduling policy to detect
|
||||
// the end of the drain operations will hang, but all such places should enforce an
|
||||
// overall timeout. The scheduling policy will be updated upon node re-attach and/or
|
||||
// by the counterpart fill operation.
|
||||
tracing::warn!(%node_id, "Failed to finalise drain by setting scheduling policy: {err}");
|
||||
}
|
||||
|
||||
tracing::info!(%node_id, "Completed drain background operation");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
|
||||
/// 1. The node should be filled until it reaches the expected cluster average of
|
||||
/// attached shards. If there are not enough secondaries on the node, the plan stops early.
|
||||
/// 2. Select tenant shards to promote such that the number of attached shards is balanced
|
||||
/// throughout the cluster. We achieve this by picking tenant shards from each node,
|
||||
/// starting from the ones with the largest number of attached shards, until the node
|
||||
/// reaches the expected cluster average.
|
||||
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
|
||||
|
||||
let mut tids_by_node = locked
|
||||
.tenants
|
||||
.iter_mut()
|
||||
.filter_map(|(tid, tenant_shard)| {
|
||||
if tenant_shard.intent.get_secondary().contains(&node_id) {
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
return Some((*primary, *tid));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
})
|
||||
.into_group_map();
|
||||
|
||||
let expected_attached = locked.scheduler.expected_attached_shard_count();
|
||||
let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
|
||||
|
||||
let mut plan = Vec::new();
|
||||
for (node_id, attached) in nodes_by_load {
|
||||
if plan.len() >= fill_requirement
|
||||
|| tids_by_node.is_empty()
|
||||
|| attached <= expected_attached
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
let can_take = attached - expected_attached;
|
||||
let mut remove_node = false;
|
||||
for _ in 0..can_take {
|
||||
match tids_by_node.get_mut(&node_id) {
|
||||
Some(tids) => match tids.pop() {
|
||||
Some(tid) => {
|
||||
plan.push(tid);
|
||||
}
|
||||
None => {
|
||||
remove_node = true;
|
||||
break;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if remove_node {
|
||||
tids_by_node.remove(&node_id);
|
||||
}
|
||||
}
|
||||
|
||||
plan
|
||||
}
|
||||
|
||||
/// Fill a node by promoting its secondaries until the cluster is balanced
|
||||
/// with regards to attached shard counts. Note that this operation only
|
||||
/// makes sense as a counterpart to the drain implemented in [`Service::drain_node`].
|
||||
/// This is a long running operation and it should run as a separate Tokio task.
|
||||
pub(crate) async fn fill_node(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), OperationError> {
|
||||
// TODO(vlad): Currently this operates on the assumption that all
|
||||
// secondaries are warm. This is not always true (e.g. we just migrated the
|
||||
// tenant). Take that into consideration by checking the secondary status.
|
||||
|
||||
tracing::info!(%node_id, "Starting fill background operation");
|
||||
|
||||
let mut tids_to_promote = self.fill_node_plan(node_id);
|
||||
|
||||
let mut waiters = Vec::new();
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
|
||||
// Execute the plan we've composed above. Before aplying each move from the plan,
|
||||
// we validate to ensure that it has not gone stale in the meantime.
|
||||
while !tids_to_promote.is_empty() {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(OperationError::Cancelled);
|
||||
}
|
||||
|
||||
{
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
|
||||
format!("node {node_id} was removed").into(),
|
||||
))?;
|
||||
|
||||
let current_policy = node.get_scheduling();
|
||||
if !matches!(current_policy, NodeSchedulingPolicy::Filling) {
|
||||
// TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
|
||||
// about it
|
||||
return Err(OperationError::NodeStateChanged(
|
||||
format!("node {node_id} changed state to {current_policy:?}").into(),
|
||||
));
|
||||
}
|
||||
|
||||
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
|
||||
if let Some(tid) = tids_to_promote.pop() {
|
||||
if let Some(tenant_shard) = tenants.get_mut(&tid) {
|
||||
// If the node being filled is not a secondary anymore,
|
||||
// skip the promotion.
|
||||
if !tenant_shard.intent.get_secondary().contains(&node_id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
tenant_shard.intent.promote_attached(scheduler, node_id);
|
||||
match tenant_shard.schedule(scheduler, &mut schedule_context) {
|
||||
Err(e) => {
|
||||
tracing::warn!(%tid, "Scheduling error when filling pageserver {} : {e}", node_id);
|
||||
}
|
||||
Ok(()) => {
|
||||
if let Some(waiter) =
|
||||
self.maybe_reconcile_shard(tenant_shard, nodes)
|
||||
{
|
||||
waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
|
||||
.await;
|
||||
}
|
||||
|
||||
while !waiters.is_empty() {
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
|
||||
.await;
|
||||
}
|
||||
|
||||
if let Err(err) = self
|
||||
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
|
||||
.await
|
||||
{
|
||||
// This isn't a huge issue since the filling process starts upon request. However, it
|
||||
// will prevent the next drain from starting. The only case in which this can fail
|
||||
// is database unavailability. Such a case will require manual intervention.
|
||||
tracing::error!(%node_id, "Failed to finalise fill by setting scheduling policy: {err}");
|
||||
}
|
||||
|
||||
tracing::info!(%node_id, "Completed fill background operation");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,9 +8,11 @@ use crate::{
|
||||
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
||||
persistence::TenantShardPersistence,
|
||||
reconciler::ReconcileUnits,
|
||||
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
|
||||
scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
|
||||
};
|
||||
use pageserver_api::controller_api::{
|
||||
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
|
||||
};
|
||||
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -153,7 +155,7 @@ impl IntentState {
|
||||
}
|
||||
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
|
||||
if let Some(node_id) = node_id {
|
||||
scheduler.node_inc_ref(node_id);
|
||||
scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach);
|
||||
}
|
||||
Self {
|
||||
attached: node_id,
|
||||
@@ -164,10 +166,10 @@ impl IntentState {
|
||||
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
|
||||
if self.attached != new_attached {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
|
||||
}
|
||||
if let Some(new_attached) = &new_attached {
|
||||
scheduler.node_inc_ref(*new_attached);
|
||||
scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach);
|
||||
}
|
||||
self.attached = new_attached;
|
||||
}
|
||||
@@ -177,22 +179,27 @@ impl IntentState {
|
||||
/// secondary to attached while maintaining the scheduler's reference counts.
|
||||
pub(crate) fn promote_attached(
|
||||
&mut self,
|
||||
_scheduler: &mut Scheduler,
|
||||
scheduler: &mut Scheduler,
|
||||
promote_secondary: NodeId,
|
||||
) {
|
||||
// If we call this with a node that isn't in secondary, it would cause incorrect
|
||||
// scheduler reference counting, since we assume the node is already referenced as a secondary.
|
||||
debug_assert!(self.secondary.contains(&promote_secondary));
|
||||
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.secondary.retain(|n| n != &promote_secondary);
|
||||
|
||||
let demoted = self.attached;
|
||||
self.attached = Some(promote_secondary);
|
||||
|
||||
scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary);
|
||||
if let Some(demoted) = demoted {
|
||||
scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
|
||||
debug_assert!(!self.secondary.contains(&new_secondary));
|
||||
scheduler.node_inc_ref(new_secondary);
|
||||
scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary);
|
||||
self.secondary.push(new_secondary);
|
||||
}
|
||||
|
||||
@@ -200,27 +207,27 @@ impl IntentState {
|
||||
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
|
||||
let index = self.secondary.iter().position(|n| *n == node_id);
|
||||
if let Some(index) = index {
|
||||
scheduler.node_dec_ref(node_id);
|
||||
scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
|
||||
self.secondary.remove(index);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
|
||||
for secondary in self.secondary.drain(..) {
|
||||
scheduler.node_dec_ref(secondary);
|
||||
scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary);
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove the last secondary node from the list of secondaries
|
||||
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
|
||||
if let Some(node_id) = self.secondary.pop() {
|
||||
scheduler.node_dec_ref(node_id);
|
||||
scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
|
||||
}
|
||||
|
||||
self.clear_secondary(scheduler);
|
||||
@@ -251,12 +258,11 @@ impl IntentState {
|
||||
/// forget the location on the offline node.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
|
||||
pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
@@ -307,6 +313,12 @@ pub(crate) struct ReconcilerWaiter {
|
||||
seq: Sequence,
|
||||
}
|
||||
|
||||
pub(crate) enum ReconcilerStatus {
|
||||
Done,
|
||||
Failed,
|
||||
InProgress,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum ReconcileWaitError {
|
||||
#[error("Timeout waiting for shard {0}")]
|
||||
@@ -369,6 +381,16 @@ impl ReconcilerWaiter {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_status(&self) -> ReconcilerStatus {
|
||||
if self.seq_wait.would_wait_for(self.seq).is_err() {
|
||||
ReconcilerStatus::Done
|
||||
} else if self.error_seq_wait.would_wait_for(self.seq).is_err() {
|
||||
ReconcilerStatus::Failed
|
||||
} else {
|
||||
ReconcilerStatus::InProgress
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Having spawned a reconciler task, the tenant shard's state will carry enough
|
||||
@@ -593,7 +615,7 @@ impl TenantShard {
|
||||
Secondary => {
|
||||
if let Some(node_id) = self.intent.get_attached() {
|
||||
// Populate secondary by demoting the attached node
|
||||
self.intent.demote_attached(*node_id);
|
||||
self.intent.demote_attached(scheduler, *node_id);
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
@@ -648,13 +670,17 @@ impl TenantShard {
|
||||
let mut scores = all_pageservers
|
||||
.iter()
|
||||
.flat_map(|node_id| {
|
||||
if matches!(
|
||||
nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule())
|
||||
.unwrap_or(MaySchedule::No),
|
||||
MaySchedule::No
|
||||
let node = nodes.get(node_id);
|
||||
if node.is_none() {
|
||||
None
|
||||
} else if matches!(
|
||||
node.unwrap().get_scheduling(),
|
||||
NodeSchedulingPolicy::Filling
|
||||
) {
|
||||
// If the node is currently filling, don't count it as a candidate to avoid,
|
||||
// racing with the background fill.
|
||||
None
|
||||
} else if matches!(node.unwrap().may_schedule(), MaySchedule::No) {
|
||||
None
|
||||
} else {
|
||||
let affinity_score = schedule_context.get_node_affinity(*node_id);
|
||||
@@ -783,7 +809,7 @@ impl TenantShard {
|
||||
old_attached_node_id,
|
||||
new_attached_node_id,
|
||||
}) => {
|
||||
self.intent.demote_attached(old_attached_node_id);
|
||||
self.intent.demote_attached(scheduler, old_attached_node_id);
|
||||
self.intent
|
||||
.promote_attached(scheduler, new_attached_node_id);
|
||||
}
|
||||
@@ -1321,7 +1347,9 @@ pub(crate) mod tests {
|
||||
assert_ne!(attached_node_id, secondary_node_id);
|
||||
|
||||
// Notifying the attached node is offline should demote it to a secondary
|
||||
let changed = tenant_shard.intent.demote_attached(attached_node_id);
|
||||
let changed = tenant_shard
|
||||
.intent
|
||||
.demote_attached(&mut scheduler, attached_node_id);
|
||||
assert!(changed);
|
||||
assert!(tenant_shard.intent.attached.is_none());
|
||||
assert_eq!(tenant_shard.intent.secondary.len(), 2);
|
||||
@@ -1604,7 +1632,14 @@ pub(crate) mod tests {
|
||||
|
||||
// We should see equal number of locations on the two nodes.
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
|
||||
// Scheduling does not consider the number of attachments picking the initial
|
||||
// pageserver to attach to (hence the assertion that all primaries are on the
|
||||
// same node)
|
||||
// TODO: Tweak the scheduling to evenly distribute attachments for new shards.
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
|
||||
|
||||
// Add another two nodes: we should see the shards spread out when their optimize
|
||||
// methods are called
|
||||
@@ -1613,9 +1648,16 @@ pub(crate) mod tests {
|
||||
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1);
|
||||
|
||||
for shard in shards.iter_mut() {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
|
||||
@@ -285,6 +285,21 @@ def test_foobar(neon_env_builder: NeonEnvBuilder):
|
||||
...
|
||||
```
|
||||
|
||||
The env includes a default tenant and timeline. Therefore, you do not need to create your own
|
||||
tenant/timeline for testing.
|
||||
|
||||
```python
|
||||
def test_foobar2(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start() # Start the environment
|
||||
with env.endpoints.create_start("main") as endpoint:
|
||||
# Start the compute endpoint
|
||||
client = env.pageserver.http_client() # Get the pageserver client
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)
|
||||
```
|
||||
|
||||
For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html
|
||||
|
||||
At the end of a test, all the nodes in the environment are automatically stopped, so you
|
||||
|
||||
@@ -2213,6 +2213,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
def node_drain(self, node_id):
|
||||
log.info(f"node_drain({node_id})")
|
||||
self.request(
|
||||
"PUT",
|
||||
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
def node_fill(self, node_id):
|
||||
log.info(f"node_fill({node_id})")
|
||||
self.request(
|
||||
"PUT",
|
||||
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
def node_status(self, node_id):
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def node_list(self):
|
||||
response = self.request(
|
||||
"GET",
|
||||
@@ -3386,7 +3410,7 @@ def static_proxy(
|
||||
yield proxy
|
||||
|
||||
|
||||
class Endpoint(PgProtocol):
|
||||
class Endpoint(PgProtocol, LogUtils):
|
||||
"""An object representing a Postgres compute endpoint managed by the control plane."""
|
||||
|
||||
def __init__(
|
||||
@@ -3452,6 +3476,7 @@ class Endpoint(PgProtocol):
|
||||
)
|
||||
path = Path("endpoints") / self.endpoint_id / "pgdata"
|
||||
self.pgdata_dir = os.path.join(self.env.repo_dir, path)
|
||||
self.logfile = self.endpoint_path() / "compute.log"
|
||||
|
||||
config_lines = config_lines or []
|
||||
|
||||
@@ -3998,6 +4023,30 @@ class S3Scrubber:
|
||||
)
|
||||
log.info(f"tenant-snapshot output: {stdout}")
|
||||
|
||||
def pageserver_physical_gc(
|
||||
self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None
|
||||
):
|
||||
args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"]
|
||||
|
||||
if tenant_ids is None:
|
||||
tenant_ids = []
|
||||
|
||||
for tenant_id in tenant_ids:
|
||||
args.extend(["--tenant-id", str(tenant_id)])
|
||||
|
||||
stdout = self.scrubber_cli(
|
||||
args,
|
||||
timeout=30,
|
||||
)
|
||||
try:
|
||||
return json.loads(stdout)
|
||||
except:
|
||||
log.error(
|
||||
"Failed to decode JSON output from `pageserver-physical_gc`. Dumping stdout:"
|
||||
)
|
||||
log.error(stdout)
|
||||
raise
|
||||
|
||||
|
||||
def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path:
|
||||
"""Compute the path to a working directory for an individual test."""
|
||||
|
||||
@@ -630,12 +630,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
timeline_id: TimelineId,
|
||||
timestamp: datetime,
|
||||
**kwargs,
|
||||
):
|
||||
log.info(
|
||||
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
|
||||
)
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z",
|
||||
**kwargs,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
@@ -921,3 +923,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json() # type: ignore
|
||||
|
||||
def perf_info(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
timeline_id: TimelineId,
|
||||
):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
log.info(f"Requesting perf info: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/perf_info",
|
||||
)
|
||||
log.info(f"Got perf info response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
@@ -313,7 +313,7 @@ def assert_prefix_empty(
|
||||
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
|
||||
# this seems like a mock_s3 issue
|
||||
log.warning(
|
||||
f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
|
||||
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
|
||||
)
|
||||
keys = 0
|
||||
elif keys != 0 and len(objects) == 0:
|
||||
|
||||
@@ -171,6 +171,8 @@ class S3Storage:
|
||||
"""Is this MOCK_S3 (false) or REAL_S3 (true)"""
|
||||
real: bool
|
||||
endpoint: Optional[str] = None
|
||||
"""formatting deserialized with humantime crate, for example "1s"."""
|
||||
custom_timeout: Optional[str] = None
|
||||
|
||||
def access_env_vars(self) -> Dict[str, str]:
|
||||
if self.aws_profile is not None:
|
||||
@@ -208,6 +210,9 @@ class S3Storage:
|
||||
if self.endpoint is not None:
|
||||
rv["endpoint"] = self.endpoint
|
||||
|
||||
if self.custom_timeout is not None:
|
||||
rv["timeout"] = self.custom_timeout
|
||||
|
||||
return rv
|
||||
|
||||
def to_toml_inline_table(self) -> str:
|
||||
|
||||
@@ -582,3 +582,20 @@ class PropagatingThread(threading.Thread):
|
||||
if self.exc:
|
||||
raise self.exc
|
||||
return self.ret
|
||||
|
||||
|
||||
def human_bytes(amt: float) -> str:
|
||||
"""
|
||||
Render a bytes amount into nice IEC bytes string.
|
||||
"""
|
||||
|
||||
suffixes = ["", "Ki", "Mi", "Gi"]
|
||||
|
||||
last = suffixes[-1]
|
||||
|
||||
for name in suffixes:
|
||||
if amt < 1024 or name == last:
|
||||
return f"{int(round(amt))} {name}B"
|
||||
amt = amt / 1024
|
||||
|
||||
raise RuntimeError("unreachable")
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user