Compare commits

..

94 Commits

Author SHA1 Message Date
John Spray
c63a952b78 Implement validation of generations before delete 2023-08-30 17:44:10 +01:00
John Spray
35e4b43531 Hook deletion queue into generations 2023-08-30 15:35:51 +01:00
John Spray
584c0d3c7b Make remote_layer_path take Generation instead of layer metadata 2023-08-30 15:13:00 +01:00
John Spray
84023207ce Merge branch 'jcsp/deletion-queue' into jcsp/generation-numbers 2023-08-30 15:07:35 +01:00
John Spray
35fa75699b switch deletion queue to local storage 2023-08-30 12:21:29 +01:00
John Spray
f77aa463c6 clippy 2023-08-30 10:37:06 +01:00
John Spray
4492d40c37 Merge remote-tracking branch 'upstream/main' into jcsp/deletion-queue 2023-08-30 10:34:16 +01:00
John Spray
2f58f39648 Revert "libs: make backoff::retry() take a cancellation token"
This reverts commit 8c2ff87f1a.
2023-08-30 10:26:15 +01:00
John Spray
10b85c0d9a fixup index_part loading 2023-08-29 17:26:08 +01:00
John Spray
cd6367b5ae fixup control_plane attach hook 2023-08-29 17:18:28 +01:00
John Spray
79f9f7c5f8 fixup control_api types 2023-08-29 17:18:28 +01:00
John Spray
ef5ce1635c fixup attach API 2023-08-29 17:18:28 +01:00
John Spray
5aecd8c4fd tests: enable generations in neon_fixture 2023-08-29 17:18:28 +01:00
John Spray
5266bf4552 remote_storage: fix LocalFs list_files 2023-08-29 17:18:28 +01:00
John Spray
a1bcad2382 DNM unit test for index part download 2023-08-29 17:18:28 +01:00
John Spray
4dd60bf7cd pageserver: generation-aware index_part.json loading 2023-08-29 17:18:28 +01:00
John Spray
3eff65618d control_plane: implement attach hook 2023-08-29 17:18:28 +01:00
John Spray
265d3b4352 pageserver: if control plane API is disabled, ignore generations 2023-08-29 17:18:28 +01:00
John Spray
000330054b pageserver: require attachment generation if control plane API is set 2023-08-29 17:18:28 +01:00
John Spray
ddb6453f56 neon_local: manage attachment_service 2023-08-29 17:18:28 +01:00
John Spray
bc95b8f1f5 pageserver: call into control plane on startup 2023-08-29 17:18:28 +01:00
John Spray
5b7d3e39d6 Move pageserver control plane API types into libs/ 2023-08-29 17:18:28 +01:00
John Spray
034bebcfcd pageserver: add control_plane_api conf 2023-08-29 17:18:28 +01:00
John Spray
9e0e2a2a9a Stub of generations API 2023-08-29 17:18:28 +01:00
John Spray
34160a15ca Support generations in RemoteTimelineClient delete 2023-08-29 17:18:28 +01:00
John Spray
f3a9c2d788 Add optional generation input during create & attach 2023-08-29 17:18:28 +01:00
John Spray
50da1b7983 Simplify Generation 2023-08-29 17:08:55 +01:00
John Spray
4a0e2d1290 Simplify None handling for Generation in LayerfileMetadata 2023-08-29 16:57:28 +01:00
John Spray
980d3ba8b0 clippy 2023-08-29 15:47:00 +01:00
John Spray
fd836d8c45 Support generations in RemoteTimelineClient delete 2023-08-29 15:36:15 +01:00
John Spray
67b17034ab pageserver: use generation in keys when writing 2023-08-29 15:36:15 +01:00
John Spray
930de712ee pageserver: add Generation type to Tenant, Timeline & Index 2023-08-29 15:36:15 +01:00
John Spray
dd033d9138 utils: introduce Generation type 2023-08-29 15:36:12 +01:00
John Spray
5a217791fd libs: give TenantTimelineId a compact string serialization
The existing derive'd Serialize/Deserialize were not used anywhere.

To enable using TenantTimelineId as a key in JSON maps, serialize
as a comma separated string.  This is also a more compact
representation.
2023-08-23 10:33:44 +01:00
John Spray
c9a007d05b deletion queue: future-proof DeletionList format
It needs places to put generation numbers
2023-08-23 10:33:44 +01:00
John Spray
696b49eeba Update deletion list doc comment for Executor 2023-08-23 09:35:10 +01:00
John Spray
206420d96a deletion queue: refactor coalescing into Executor 2023-08-23 09:16:55 +01:00
John Spray
416026381f deletion queue: refactor into frontend/backend modules 2023-08-22 16:38:13 +01:00
John Spray
d9755becab Update RemoteTimelineClient doc comment 2023-08-22 14:36:57 +01:00
John Spray
9cb255be97 Update pageserver/src/deletion_queue.rs
Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-08-22 14:10:11 +01:00
John Spray
57a44dcc01 Update pageserver/src/deletion_queue.rs
Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-08-22 14:10:06 +01:00
John Spray
1afc6337fb Remove unused num_inprogress_deletions 2023-08-22 14:06:15 +01:00
John Spray
74058e196a remote_storage: defensively handle 404 on deletions
S3 implementions are _meant_ to return 200 on deleting
a nonexistent object, but S3 is not a standard and
some implementations have their own ideas.
2023-08-22 13:52:58 +01:00
John Spray
a116f6656f deletion queue: more consistent use of backoff::retry 2023-08-22 13:38:31 +01:00
John Spray
2c7b97245a tweak test_remote_storage_upload_queue_retries 2023-08-22 13:34:12 +01:00
John Spray
6efddbf526 flush tweaks 2023-08-22 13:17:57 +01:00
John Spray
7c4d79f4db deletion queue: cancellable retries 2023-08-22 13:05:04 +01:00
John Spray
8c2ff87f1a libs: make backoff::retry() take a cancellation token 2023-08-22 12:39:19 +01:00
John Spray
23fc247a03 remove redundant spans 2023-08-22 11:22:51 +01:00
John Spray
d8dc4425f8 Merge remote-tracking branch 'upstream/main' into jcsp/deletion-queue 2023-08-22 10:09:23 +01:00
John Spray
18159b7695 deletion queue: expose errors from push/flush 2023-08-22 10:01:10 +01:00
John Spray
c1bc9c0f70 Various test fixes + tweaks to flushing 2023-08-18 12:44:35 +01:00
John Spray
2de5efa208 Fix broken wait_untils in test_remote_storage_upload_queue_retries 2023-08-18 12:44:35 +01:00
John Spray
d330eac4bc clippy 2023-08-18 12:44:35 +01:00
John Spray
3ebceeda71 pageserver: refactor timeline args into TimelineResources
This sidesteps clippy complaining about function arg counts,
and will enable introducing more shared structures in future
without the noise of adding extra args to all the functions
involved in timeline setup.
2023-08-18 12:44:35 +01:00
John Spray
31729d6f4d pageserver: refactor tenant args into a structure
This way, when we add some new shared structure that the
tenants need a reference to, we do not have to add it
individually as an extra argument to the various functions.
2023-08-18 12:44:35 +01:00
John Spray
7e0e3517c1 clippy 2023-08-18 12:44:35 +01:00
John Spray
c4fc6e433d tests: add e2e deletion queue recovery test 2023-08-18 12:44:35 +01:00
John Spray
c36cba28d6 pageserver: generalize flush API 2023-08-18 12:44:35 +01:00
John Spray
8eaa4015de deletion queue: versions in keys 2023-08-18 12:44:35 +01:00
John Spray
10e927ee3e Add encoding versions to deletion queue structs 2023-08-18 12:44:35 +01:00
John Spray
bb3a59f275 clippy 2023-08-18 12:44:35 +01:00
John Spray
a0ed43cc12 deletion queue: add DeletionHeader for sequence numbers 2023-08-18 12:44:35 +01:00
John Spray
99dc5a5c27 Deletion queue: implement recovery on startup 2023-08-18 12:44:35 +01:00
John Spray
54db1f5d8a remote_storage: add a helper for downloading full objects
This is only for use with small objects that we will
deserialize in a non-streaming way.

Also add a strip_prefix method to RemotePath.
2023-08-18 12:44:35 +01:00
John Spray
404b25e45f Remove vestigial remote_timeline_client deletion paths 2023-08-18 12:44:35 +01:00
John Spray
f4dba9f907 tests: update tenant deletion tests for deletion queue 2023-08-18 12:44:35 +01:00
John Spray
4ec45bc7dc tests: update tenant deletion tests for deletion queue 2023-08-18 12:44:35 +01:00
John Spray
a00d4a8d8c tests: update test_remote_timeline_client_calls_started_metric for deletion queue 2023-08-18 12:44:35 +01:00
John Spray
43c9a09d8f tests: update remote storage test for deletion queue 2023-08-18 12:44:35 +01:00
John Spray
3edd7ece40 deletion queue: improve frontend retry 2023-08-18 12:44:35 +01:00
John Spray
504fe9c2b0 pageserver: send timeline deletions through the deletion queue 2023-08-18 12:44:35 +01:00
John Spray
10df237a81 deletion queue: add push for generic objects (layers and garbage) 2023-08-18 12:44:35 +01:00
John Spray
d40f8475a5 Error metric and retries 2023-08-18 12:44:35 +01:00
John Spray
164f916a40 Spawn deletion workers with info spans 2023-08-18 12:44:35 +01:00
John Spray
4ebc29768c Add failpoint for deletion execution 2023-08-18 12:44:35 +01:00
John Spray
bae62916dc pageserver/http: add /v1/deletion_queue/flush_execute
This is principally for tesing, but might be useful in
the field if we want to e.g. flush a deletion queue
before running an external scrub tool
2023-08-18 12:44:35 +01:00
John Spray
5e2b8b376c utils: add ApiError::ShuttingDown
So that handlers that check their CancellationToken
explicitly can map it to a set http status.
2023-08-18 12:44:35 +01:00
John Spray
54ec7919b8 pageserver: add deletion queue submitted/executed metrics 2023-08-18 12:44:35 +01:00
John Spray
e0bed0732c Tweak deletion queue constants 2023-08-18 12:44:35 +01:00
John Spray
9e92121cc3 pageserver: flush deletion queue on clean shutdown 2023-08-18 12:44:35 +01:00
John Spray
50a9508f4f clippy 2023-08-18 12:44:35 +01:00
John Spray
f61402be24 pageserver: testing for deletion queue 2023-08-18 12:44:35 +01:00
John Spray
975e4f2235 Refactor deletion worker construction 2023-08-18 12:44:35 +01:00
John Spray
537eca489e Implement flush_execute() in deletion queue 2023-08-18 12:44:35 +01:00
John Spray
de4882886e pageserver: implement batching in deletion queue 2023-08-18 12:44:35 +01:00
John Spray
6982288426 pageserver: implement frontend of deletion queue 2023-08-18 12:44:35 +01:00
John Spray
ccfcfa1098 remote_storage: implement Serialize/Deserialize for RemotePath 2023-08-18 12:44:35 +01:00
John Spray
e2c793c897 Use deletion queue in schedule_layer_file_deletion 2023-08-18 12:44:33 +01:00
John Spray
0fdc492aa4 Add MockDeletionQueue for unit tests 2023-08-18 11:25:40 +01:00
John Spray
787b099541 wire deletion queue into timeline 2023-08-18 11:25:40 +01:00
John Spray
3af693749d pageserver: wire deletion queue through to Tenant 2023-08-18 11:25:40 +01:00
John Spray
6f9ae6bb5f pageserver: instantiate deletion queue at process scope 2023-08-18 11:25:40 +01:00
John Spray
16d77dcb73 Initial stub implementation of deletion queue 2023-08-18 11:25:40 +01:00
192 changed files with 5845 additions and 11493 deletions

View File

@@ -14,12 +14,10 @@
!pgxn/
!proxy/
!safekeeper/
!s3_scrubber/
!storage_broker/
!trace/
!vendor/postgres-v14/
!vendor/postgres-v15/
!vendor/postgres-v16/
!workspace_hack/
!neon_local/
!scripts/ninstall.sh

View File

@@ -1,8 +0,0 @@
self-hosted-runner:
labels:
- gen3
- large
- small
- us-east-2
config-variables:
- SLACK_UPCOMING_RELEASE_CHANNEL_ID

View File

@@ -70,9 +70,6 @@ runs:
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
prefix: latest
# The lack of compatibility snapshot (for example, for the new Postgres version)
# shouldn't fail the whole job. Only relevant test should fail.
skip-if-does-not-exist: true
- name: Checkout
if: inputs.needs_postgres_source == 'true'

View File

@@ -1,31 +0,0 @@
name: Lint GitHub Workflows
on:
push:
branches:
- main
- release
paths:
- '.github/workflows/*.ya?ml'
pull_request:
paths:
- '.github/workflows/*.ya?ml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: reviewdog/action-actionlint@v1
env:
# SC2046 - Quote this to prevent word splitting. - https://www.shellcheck.net/wiki/SC2046
# SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086
SHELLCHECK_OPTS: --exclude=SC2046,SC2086
with:
fail_on_error: true
filter_mode: nofilter
level: error

View File

@@ -2,9 +2,7 @@ name: Handle `approved-for-ci-run` label
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
on:
pull_request_target:
branches:
- main
pull_request:
types:
# Default types that triggers a workflow ([1]):
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
@@ -19,83 +17,39 @@ on:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"
permissions: write-all
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
jobs:
remove-label:
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
# The PR should be reviewed and labelled manually again.
runs-on: [ ubuntu-latest ]
if: |
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
create-or-update-pr-for-ci-run:
# Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
create-branch:
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
runs-on: [ ubuntu-latest ]
if: |
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
- uses: actions/checkout@v3
with:
ref: main
token: ${{ secrets.CI_ACCESS_TOKEN }}
- run: gh pr checkout "${PR_NUMBER}"
- run: git checkout -b "${BRANCH}"
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
- run: git push --force origin "${BRANCH}"
- name: Create a Pull Request for CI run (if required)
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
This Pull Request is created automatically to run the CI pipeline for #${PR_NUMBER}
Please do not alter or merge/close it.
Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
EOF
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --base main --json number --jq '.[].number')"
if [ -z "${ALREADY_CREATED}" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
--body-file "body.md" \
--head "${BRANCH}" \
--base "main" \
--draft
fi
cleanup:
# Close PRs and delete branchs if the original PR is closed.
if: |
github.event.action == 'closed' &&
github.event.pull_request.head.repo.full_name != github.repository
runs-on: ubuntu-latest
steps:
- run: |
CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --json 'closed' --jq '.[].closed')"
if [ "${CLOSED}" == "false" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
fi
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"

View File

@@ -117,7 +117,6 @@ jobs:
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}
steps:
- name: Generate matrix for pgbench benchmark
@@ -137,11 +136,11 @@ jobs:
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
- name: Generate matrix for OLAP benchmarks
id: olap-compare-matrix
@@ -153,30 +152,11 @@ jobs:
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
- name: Generate matrix for TPC-H benchmarks
id: tpch-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-reuse"
],
"scale": [
"10"
]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
pgbench-compare:
needs: [ generate-matrices ]
@@ -253,11 +233,7 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
psql ${CONNSTR} -c "SELECT version();"
- name: Benchmark init
uses: ./.github/actions/run-python-test-set
@@ -382,11 +358,7 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
psql ${CONNSTR} -c "SELECT version();"
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
@@ -400,7 +372,6 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
TEST_OLAP_SCALE: 10
- name: Create Allure report
if: ${{ !cancelled() }}
@@ -427,7 +398,7 @@ jobs:
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -436,7 +407,6 @@ jobs:
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }}
TEST_OLAP_SCALE: ${{ matrix.scale }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
@@ -458,17 +428,18 @@ jobs:
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Get Connstring Secret Name
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-reuse)
ENV_PLATFORM=CAPTEST_TPCH
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
;;
rds-aurora)
ENV_PLATFORM=RDS_AURORA_TPCH
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
;;
rds-postgres)
ENV_PLATFORM=RDS_AURORA_TPCH
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -476,21 +447,9 @@ jobs:
;;
esac
CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
psql ${CONNSTR} -c "SELECT version();"
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
@@ -504,7 +463,6 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
TEST_OLAP_SCALE: ${{ matrix.scale }}
- name: Create Allure report
if: ${{ !cancelled() }}
@@ -576,11 +534,7 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
psql ${CONNSTR} -c "SELECT version();"
- name: Run user examples
uses: ./.github/actions/run-python-test-set

View File

@@ -5,6 +5,7 @@ on:
branches:
- main
- release
- ci-run/pr-*
pull_request:
defaults:
@@ -23,30 +24,7 @@ env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
check-permissions:
runs-on: ubuntu-latest
steps:
- name: Disallow PRs from forks
if: |
github.event_name == 'pull_request' &&
github.event.pull_request.head.repo.full_name != github.repository
run: |
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
else
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
fi
echo >&2 "We don't run CI for PRs from forks"
echo >&2 "${MESSAGE}"
exit 1
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
@@ -75,7 +53,6 @@ jobs:
id: build-tag
check-codestyle-python:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -108,7 +85,6 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -175,7 +151,6 @@ jobs:
run: cargo deny check
build-neon:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -212,7 +187,7 @@ jobs:
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
FAILED=false
for postgres in postgres-v14 postgres-v15 postgres-v16; do
for postgres in postgres-v14 postgres-v15; do
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
actual=$(git rev-parse "HEAD:vendor/${postgres}")
if [ "${expected}" != "${actual}" ]; then
@@ -234,10 +209,6 @@ jobs:
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -258,12 +229,10 @@ jobs:
cov_prefix=""
CARGO_FLAGS="--locked --release"
fi
{
echo "cov_prefix=${cov_prefix}"
echo "CARGO_FEATURES=${CARGO_FEATURES}"
echo "CARGO_FLAGS=${CARGO_FLAGS}"
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
} >> $GITHUB_ENV
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
# Disabled for now
# Don't include the ~/.cargo/registry/src directory. It contains just
@@ -298,13 +267,6 @@ jobs:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
@@ -313,10 +275,6 @@ jobs:
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
@@ -390,17 +348,17 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb
options: --init --shm-size=512mb
needs: [ build-neon ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
pg_version: [ v14, v15, v16 ]
pg_version: [ v14, v15 ]
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -428,12 +386,12 @@ jobs:
uses: ./.github/actions/save-coverage-data
benchmarks:
needs: [ check-permissions, build-neon ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb
options: --init --shm-size=512mb
needs: [ build-neon ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
fail-fast: false
@@ -460,13 +418,12 @@ jobs:
# while coverage is currently collected for the debug ones
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: ${{ !cancelled() }}
steps:
- uses: actions/checkout@v3
@@ -492,40 +449,42 @@ jobs:
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
}
const coverage = {
coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}",
summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}",
}
const script = require("./scripts/comment-test-report.js")
await script({
github,
context,
fetch,
report,
coverage,
})
coverage-report:
needs: [ check-permissions, regress-tests ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests ]
strategy:
fail-fast: false
matrix:
build_type: [ debug ]
outputs:
coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
fetch-depth: 1
# Disabled for now
# - name: Restore cargo deps cache
# id: cache_cargo
# uses: actions/cache@v3
# with:
# path: |
# ~/.cargo/registry/
# !~/.cargo/registry/src
# ~/.cargo/git/
# target/
# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -568,45 +527,13 @@ jobs:
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Build coverage report NEW
id: upload-coverage-report-new
env:
BUCKET: neon-github-public-dev
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
BASELINE="$(git merge-base HEAD origin/main)"
CURRENT="${COMMIT_SHA}"
cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info"
# Use differential coverage if the baseline coverage exists.
# It can be missing if the coverage repoer wasn't uploaded yet or tests has failed on BASELINE commit.
if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then
git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff
GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}"
fi
genhtml ${GENHTML_ARGS}
aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
- uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
const { REPORT_URL, COMMIT_SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
@@ -617,21 +544,12 @@ jobs:
context: 'Code coverage report',
})
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${COMMIT_SHA}`,
state: 'success',
target_url: `${REPORT_URL_NEW}`,
context: 'Code coverage report NEW',
})
trigger-e2e-tests:
needs: [ check-permissions, promote-images, tag ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ promote-images, tag ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
@@ -672,8 +590,8 @@ jobs:
}"
neon-image:
needs: [ check-permissions, tag ]
runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
@@ -720,7 +638,7 @@ jobs:
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ check-permissions, tag ]
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
@@ -765,17 +683,17 @@ jobs:
run: rm -rf ~/.ecr
compute-node-image:
needs: [ check-permissions, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: gcr.io/kaniko-project/executor:v1.9.2-debug
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
options: --add-host=download.osgeo.org:140.211.15.30
needs: [ tag ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
version: [ v14, v15 ]
defaults:
run:
shell: sh -eu {0}
@@ -824,12 +742,12 @@ jobs:
run: rm -rf ~/.ecr
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, gen3, large ]
needs: [ tag, compute-node-image ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
version: [ v14, v15 ]
defaults:
run:
shell: sh -eu {0}
@@ -866,7 +784,7 @@ jobs:
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
runs-on: [ self-hosted, gen3, small ]
steps:
@@ -909,8 +827,8 @@ jobs:
docker compose -f ./docker-compose/docker-compose.yml down
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: [ self-hosted, gen3, small ]
needs: [ tag, test-images, vm-compute-node-image ]
container: golang:1.19-bullseye
# Don't add if-condition here.
# The job should always be run because we have dependant other jobs that shouldn't be skipped
@@ -930,7 +848,6 @@ jobs:
run: |
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
- name: Add latest tag to images
if: |
@@ -943,8 +860,6 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: |
@@ -957,8 +872,6 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
- name: Configure Docker Hub login
run: |
@@ -970,7 +883,6 @@ jobs:
run: |
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
- name: Push latest tags to Docker Hub
if: |
@@ -983,19 +895,21 @@ jobs:
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
runs-on: ubuntu-latest
build-private-extensions:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ tag ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }}
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"
curl -f -X POST \
@@ -1025,50 +939,11 @@ jobs:
}
}"
- name: Wait for extension build to finish
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
INTERVAL=15 # try each N seconds
last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
for ((i=0; i <= TIMEOUT; i+=INTERVAL)); do
sleep $INTERVAL
# Get statuses for the latest commit in the PR / branch
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json
# Get the latest status for the "build-and-upload-extensions" context
last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json)
if [ "${last_status}" = "pending" ]; then
# Extension build is still in progress.
continue
elif [ "${last_status}" = "success" ]; then
# Extension build is successful.
exit 0
else
# Status is neither "pending" nor "success", exit the loop and fail the job.
break
fi
done
# Extension build failed, print `statuses.json` for debugging and fail the job.
jq '.' statuses.json
echo >&2 "Status of extension build is '${last_status}' != 'success'"
exit 1
deploy:
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership
run: |
@@ -1106,35 +981,20 @@ jobs:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
await github.rest.git.createRef({
github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
sha: context.sha,
})
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: "${{ needs.tag.outputs.build-tag }}",
generate_release_notes: true,
})
promote-compatibility-data:
needs: [ check-permissions, promote-images, tag, regress-tests ]
if: github.ref_name == 'release'
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ promote-images, tag, regress-tests ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
steps:
- name: Promote compatibility snapshot for the release
env:
@@ -1142,7 +1002,7 @@ jobs:
PREFIX: artifacts/latest
run: |
# Update compatibility snapshot for the release
for pg_version in v14 v15 v16; do
for pg_version in v14 v15; do
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst

View File

@@ -4,6 +4,7 @@ on:
push:
branches:
- main
- ci-run/pr-*
pull_request:
defaults:
@@ -38,7 +39,7 @@ jobs:
fetch-depth: 1
- name: Install macOS postgres dependencies
run: brew install flex bison openssl protobuf icu4c pkg-config
run: brew install flex bison openssl protobuf
- name: Set pg 14 revision for caching
id: pg_v14_rev
@@ -48,10 +49,6 @@ jobs:
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v3
@@ -66,13 +63,6 @@ jobs:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -96,10 +86,6 @@ jobs:
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: make neon-pg-ext -j$(nproc)

View File

@@ -1,29 +0,0 @@
name: Notify Slack channel about upcoming release
concurrency:
group: ${{ github.workflow }}-${{ github.event.number }}
cancel-in-progress: true
on:
pull_request:
branches:
- release
types:
# Default types that triggers a workflow:
# - https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
- opened
- synchronize
- reopened
# Additional types that we want to handle:
- closed
jobs:
notify:
runs-on: [ ubuntu-latest ]
steps:
- uses: neondatabase/dev-actions/release-pr-notify@main
with:
slack-token: ${{ secrets.SLACK_BOT_TOKEN }}
slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then `#test-release-notifications`
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -2,19 +2,16 @@ name: Create Release Branch
on:
schedule:
- cron: '0 7 * * 2'
- cron: '0 10 * * 2'
workflow_dispatch:
jobs:
create_release_branch:
runs-on: [ ubuntu-latest ]
permissions:
contents: write # for `git push`
runs-on: [ubuntu-latest]
steps:
- name: Check out code
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
ref: main
@@ -29,16 +26,9 @@ jobs:
run: git push origin releases/${{ steps.date.outputs.date }}
- name: Create pull request into release
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Release ${{ steps.date.outputs.date }}
**Please merge this PR using 'Create a merge commit'!**
EOF
gh pr create --title "Release ${{ steps.date.outputs.date }}" \
--body-file "body.md" \
--head "releases/${{ steps.date.outputs.date }}" \
--base "release"
uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
head: releases/${{ steps.date.outputs.date }}
base: release
title: Release ${{ steps.date.outputs.date }}

4
.gitmodules vendored
View File

@@ -6,7 +6,3 @@
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git
branch = REL_15_STABLE_neon
[submodule "vendor/postgres-v16"]
path = vendor/postgres-v16
url = https://github.com/neondatabase/postgres.git
branch = REL_16_STABLE_neon

View File

@@ -27,28 +27,3 @@ your patch's fault. Help to fix the root cause if something else has
broken the CI, before pushing.
*Happy Hacking!*
# How to run a CI pipeline on Pull Requests from external contributors
_An instruction for maintainers_
## TL;DR:
- Review the PR
- If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
- Press the "Approve and run" button in GitHub UI
- Add the `approved-for-ci-run` label to the PR
Repeat all steps after any change to the PR.
- When the changes are ready to get merged — merge the original PR (not the internal one)
## Longer version:
GitHub Actions triggered by the `pull_request` event don't share repository secrets with the forks (for security reasons).
So, passing the CI pipeline on Pull Requests from external contributors is impossible.
We're using the following approach to make it work:
- After the review, assign the `approved-for-ci-run` label to the PR if changes look safe
- A GitHub Action will create an internal branch and a new PR with the same changes (for example, for a PR `#1234`, it'll be a branch `ci-run/pr-1234`)
- Because the PR is created from the internal branch, it is able to access repository secrets (that's why it's crucial to make sure that the PR doesn't contain any malicious code that could expose our secrets or intentionally harm the CI)
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)

499
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,4 @@
[workspace]
resolver = "2"
members = [
"compute_tools",
"control_plane",
@@ -8,7 +7,6 @@ members = [
"proxy",
"safekeeper",
"storage_broker",
"s3_scrubber",
"workspace_hack",
"trace",
"libs/compute_api",
@@ -39,11 +37,11 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.29"
aws-smithy-http = "0.56"
aws-credential-types = "0.56"
aws-types = "0.56"
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.27"
aws-smithy-http = "0.55"
aws-credential-types = "0.55"
aws-types = "0.55"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -107,12 +105,12 @@ reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
rpds = "0.13"
rustls = "0.21"
rustls = "0.20"
rustls-pemfile = "1"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
@@ -127,11 +125,11 @@ sync_wrapper = "0.1.2"
tar = "0.4"
test-context = "0.1"
thiserror = "1.0"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"
tokio-postgres-rustls = "0.9.0"
tokio-rustls = "0.23"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7", features = ["io"] }
@@ -145,7 +143,7 @@ tracing-subscriber = { version = "0.3", default_features = false, features = ["s
url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
walkdir = "2.3.2"
webpki-roots = "0.25"
webpki-roots = "0.23"
x509-parser = "0.15"
## TODO replace this with tracing
@@ -184,8 +182,8 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
criterion = "0.5.1"
rcgen = "0.11"
rstest = "0.18"
rcgen = "0.10"
rstest = "0.17"
tempfile = "3.4"
tonic-build = "0.9"

View File

@@ -12,7 +12,6 @@ WORKDIR /home/nonroot
COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -40,7 +39,6 @@ ARG CACHEPOT_BUCKET=neon-github-dev
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
@@ -67,7 +65,6 @@ RUN set -e \
&& apt install -y \
libreadline-dev \
libseccomp-dev \
libicu67 \
openssl \
ca-certificates \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
@@ -84,7 +81,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.

View File

@@ -74,8 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
@@ -124,8 +124,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.8.tar.gz -O plv8.tar.gz && \
echo "92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 plv8.tar.gz" | sha256sum --check && \
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -172,8 +172,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
cp -R /h3/usr / && \
rm -rf build
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -243,8 +243,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
FROM build-deps AS hypopg-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -307,8 +307,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta
FROM build-deps AS ip4r-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \
echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \
mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -323,8 +323,8 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
FROM build-deps AS prefix-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \
mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -339,8 +339,8 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
FROM build-deps AS hll-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \
mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -355,8 +355,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
FROM build-deps AS plpgsql-check-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -371,21 +371,12 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz
FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export TIMESCALEDB_VERSION=2.10.1 \
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
;; \
*) \
echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
esac && \
apt-get update && \
RUN apt-get update && \
apt-get install -y cmake && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
cd build && \
@@ -414,10 +405,6 @@ RUN case "${PG_VERSION}" in \
export PG_HINT_PLAN_VERSION=15_1_5_0 \
export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
;; \
"v16") \
export PG_HINT_PLAN_VERSION=16_1_6_0 \
export PG_HINT_PLAN_CHECKSUM=ce6a8040c78012000f5da7240caf6a971401412f41d33f930f09291e6c304b99 \
;; \
*) \
echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
;; \
@@ -465,8 +452,8 @@ FROM build-deps AS pg-cron-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -492,8 +479,8 @@ RUN apt-get update && \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
cmake \
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
@@ -564,16 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.6 \
export PG_EMBEDDING_CHECKSUM=b2e2b359335d26987778c7fae0c9bcc8ebc3530fc214113be1ddbc8a136e52ac \
;; \
*) \
echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
esac && \
wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -605,10 +584,6 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
# Layer "rust extensions"
# This layer is used to build `pgx` deps
#
# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
# dependency on all the rust extension that depend on it, too.
#
#########################################################################################
FROM build-deps AS rust-extensions-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -623,17 +598,7 @@ USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
;; \
esac && \
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
@@ -650,21 +615,10 @@ USER root
#########################################################################################
FROM rust-extensions-build AS pg-jsonschema-pg-build
ARG PG_VERSION
# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -679,23 +633,12 @@ RUN case "${PG_VERSION}" in \
#########################################################################################
FROM rust-extensions-build AS pg-graphql-pg-build
ARG PG_VERSION
# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in
# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
# same 1.1 version we've used before.
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -713,20 +656,9 @@ RUN case "${PG_VERSION}" in \
#########################################################################################
FROM rust-extensions-build AS pg-tiktoken-pg-build
ARG PG_VERSION
# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
cargo pgx install --release && \
@@ -740,19 +672,8 @@ RUN case "${PG_VERSION}" in \
#########################################################################################
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -805,20 +726,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
-s install && \
case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \

View File

@@ -29,7 +29,6 @@ else ifeq ($(UNAME_S),Darwin)
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
@@ -84,8 +83,6 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
# I'm not sure why it wouldn't work, but this is the only place (apart from
# the "build-all-versions" entry points) where direct mention of PostgreSQL
# versions is used.
.PHONY: postgres-configure-v16
postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
.PHONY: postgres-configure-v15
postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
.PHONY: postgres-configure-v14
@@ -121,10 +118,6 @@ postgres-clean-%:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean
.PHONY: postgres-check-%
postgres-check-%: postgres-%
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-%
+@echo "Compiling neon $*"
@@ -137,11 +130,6 @@ neon-pg-ext-%: postgres-%
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+@echo "Compiling neon_rmgr $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
+@echo "Compiling neon_test_utils $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -152,13 +140,6 @@ neon-pg-ext-%: postgres-%
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
# pg_embedding was temporarily released as hnsw from this repo, when we only
# supported PostgreSQL 14 and 15
neon-pg-ext-v14: neon-pg-ext-hnsw-v14
neon-pg-ext-v15: neon-pg-ext-hnsw-v15
neon-pg-ext-hnsw-%: postgres-headers-% postgres-%
+@echo "Compiling hnsw $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -186,39 +167,28 @@ neon-pg-ext-clean-%:
.PHONY: neon-pg-ext
neon-pg-ext: \
neon-pg-ext-v14 \
neon-pg-ext-v15 \
neon-pg-ext-v16
neon-pg-ext-v15
.PHONY: neon-pg-ext-clean
neon-pg-ext-clean: \
neon-pg-ext-clean-v14 \
neon-pg-ext-clean-v15 \
neon-pg-ext-clean-v16
neon-pg-ext-clean-v15
# shorthand to build all Postgres versions
.PHONY: postgres
postgres: \
postgres-v14 \
postgres-v15 \
postgres-v16
postgres-v15
.PHONY: postgres-headers
postgres-headers: \
postgres-headers-v14 \
postgres-headers-v15 \
postgres-headers-v16
postgres-headers-v15
.PHONY: postgres-clean
postgres-clean: \
postgres-clean-v14 \
postgres-clean-v15 \
postgres-clean-v16
.PHONY: postgres-check
postgres-check: \
postgres-check-v14 \
postgres-check-v15 \
postgres-check-v16
postgres-clean-v15
# This doesn't remove the effects of 'configure'.
.PHONY: clean

View File

@@ -29,18 +29,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev openssl python-poetry lsof
libcurl4-openssl-dev openssl python-poetry
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel openssl poetry lsof
protobuf-devel libcurl-devel openssl poetry
```
* On Arch based systems, these packages are needed:
```bash
pacman -S base-devel readline zlib libseccomp openssl clang \
postgresql-libs cmake postgresql protobuf curl lsof
postgresql-libs cmake postgresql protobuf curl
```
Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -55,7 +55,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf openssl flex bison icu4c pkg-config
brew install protobuf openssl flex bison
# add openssl to PATH, required for ed25519 keys generation in neon_local
echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc

View File

@@ -1,5 +0,0 @@
disallowed-methods = [
"tokio::task::block_in_place",
# Allow this for now, to deny it later once we stop using Handle::block_on completely
# "tokio::runtime::Handle::block_on",
]

View File

@@ -1,39 +1,12 @@
use anyhow::{anyhow, Ok, Result};
use postgres::Client;
use anyhow::{anyhow, Result};
use tokio_postgres::NoTls;
use tracing::{error, instrument, warn};
use tracing::{error, instrument};
use crate::compute::ComputeNode;
/// Create a special service table for availability checks
/// only if it does not exist already.
pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
let query = "
DO $$
BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename = 'health_check'
)
THEN
CREATE TABLE health_check (
id serial primary key,
updated_at timestamptz default now()
);
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();
END IF;
END
$$;";
client.execute(query, &[])?;
Ok(())
}
/// Update timestamp in a row in a special service table to check
/// that we can actually write some data in this particular timeline.
/// Create table if it's missing.
#[instrument(skip_all)]
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
// Connect to the database.
@@ -51,28 +24,21 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
});
let query = "
CREATE TABLE IF NOT EXISTS health_check (
id serial primary key,
updated_at timestamptz default now()
);
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();";
match client.simple_query(query).await {
Result::Ok(result) => {
if result.len() != 1 {
return Err(anyhow::anyhow!(
"expected 1 query results, but got {}",
result.len()
));
}
}
Err(err) => {
if let Some(state) = err.code() {
if state == &tokio_postgres::error::SqlState::DISK_FULL {
warn!("Tenant disk is full");
return Ok(());
}
}
return Err(err.into());
}
let result = client.simple_query(query).await?;
if result.len() != 2 {
return Err(anyhow::format_err!(
"expected 2 query results, but got {}",
result.len()
));
}
Ok(())

View File

@@ -27,7 +27,6 @@ use utils::measured_stream::MeasuredReader;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use crate::checker::create_availability_check_data;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -697,7 +696,6 @@ impl ComputeNode {
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, self.connstr.as_str())?;
handle_extensions(spec, &mut client)?;
create_availability_check_data(&mut client)?;
// 'Close' connection
drop(client);
@@ -1080,8 +1078,7 @@ LIMIT 100",
let mut download_tasks = Vec::new();
for library in &libs_vec {
let (ext_name, ext_path) =
remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
download_tasks.push(self.download_extension(ext_name, ext_path));
}
let results = join_all(download_tasks).await;

View File

@@ -46,6 +46,8 @@ pub fn write_postgres_conf(
writeln!(file, "{}", conf)?;
}
write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
if let Some(s) = &spec.pageserver_connstring {

View File

@@ -74,7 +74,6 @@ More specifically, here is an example ext_index.json
use anyhow::Context;
use anyhow::{self, Result};
use compute_api::spec::RemoteExtSpec;
use regex::Regex;
use remote_storage::*;
use serde_json;
use std::io::Read;
@@ -107,71 +106,16 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {
pub fn get_pg_version(pgbin: &str) -> String {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
// such as "PostgreSQL 15.4". We parse this to v14/v15
let human_version = get_pg_config("--version", pgbin);
return parse_pg_version(&human_version).to_string();
}
fn parse_pg_version(human_version: &str) -> &str {
// Normal releases have version strings like "PostgreSQL 15.4". But there
// are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
// 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
// configure option, you can tack any string to the version number,
// e.g. "PostgreSQL 15.4foobar".
match Regex::new(r"^PostgreSQL (?<major>\d+).+")
.unwrap()
.captures(human_version)
{
Some(captures) if captures.len() == 2 => match &captures["major"] {
"14" => return "v14",
"15" => return "v15",
"16" => return "v16",
_ => {}
},
_ => {}
if human_version.contains("15") {
return "v15".to_string();
} else if human_version.contains("14") {
return "v14".to_string();
}
panic!("Unsuported postgres version {human_version}");
}
#[cfg(test)]
mod tests {
use super::parse_pg_version;
#[test]
fn test_parse_pg_version() {
assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
assert_eq!(
parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
"v15"
);
assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
assert_eq!(
parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
"v14"
);
assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
}
#[test]
#[should_panic]
fn test_parse_pg_unsupported_version() {
parse_pg_version("PostgreSQL 13.14");
}
#[test]
#[should_panic]
fn test_parse_pg_incorrect_version_format() {
parse_pg_version("PostgreSQL 14");
}
}
// download the archive for a given extension,
// unzip it, and place files in the appropriate locations (share/lib)
pub async fn download_extension(
@@ -236,19 +180,7 @@ pub async fn download_extension(
// Create extension control files from spec
pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
for (ext_name, ext_data) in remote_extensions.extension_data.iter() {
// Check if extension is present in public or custom.
// If not, then it is not allowed to be used by this compute.
if let Some(public_extensions) = &remote_extensions.public_extensions {
if !public_extensions.contains(ext_name) {
if let Some(custom_extensions) = &remote_extensions.custom_extensions {
if !custom_extensions.contains(ext_name) {
continue; // skip this extension, it is not allowed
}
}
}
}
for ext_data in remote_extensions.extension_data.values() {
for (control_name, control_content) in &ext_data.control_data {
let control_path = local_sharedir.join(control_name);
if !control_path.exists() {

View File

@@ -1,6 +1,4 @@
use std::convert::Infallible;
use std::net::IpAddr;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
@@ -171,12 +169,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
};
remote_extensions.get_ext(
&filename,
is_library,
&compute.build_tag,
&compute.pgversion,
)
remote_extensions.get_ext(&filename, is_library)
};
match ext {
@@ -300,9 +293,7 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {
// this usually binds to both IPv4 and IPv6 on linux
// see e.g. https://github.com/rust-lang/rust/pull/34440
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let addr = SocketAddr::from(([0, 0, 0, 0], port));
let make_service = make_service_fn(move |_conn| {
let state = state.clone();

View File

@@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info";
// https://www.postgresql.org/docs/15/auth-password.html
//
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5";
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";

View File

@@ -12,7 +12,6 @@ git-version.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hex.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }

View File

@@ -1,7 +1,6 @@
# Minimal neon environment with one safekeeper. This is equivalent to the built-in
# defaults that you get with no --config
[[pageservers]]
id=1
[pageserver]
listen_pg_addr = '127.0.0.1:64000'
listen_http_addr = '127.0.0.1:9898'
pg_auth_type = 'Trust'

View File

@@ -1,7 +1,7 @@
use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow;
use pageserver_api::control_api::HexTenantId;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{path::PathBuf, process::Child};
use utils::id::{NodeId, TenantId};
@@ -13,11 +13,9 @@ pub struct AttachmentService {
const COMMAND: &str = "attachment_service";
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
pub tenant_id: HexTenantId,
pub pageserver_id: Option<NodeId>,
}
@@ -32,7 +30,7 @@ impl AttachmentService {
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
let listen_url = env.control_plane_api.clone().unwrap();
let listen_url = env.pageserver.control_plane_api.clone().unwrap();
let listen = format!(
"{}:{}",
@@ -80,6 +78,7 @@ impl AttachmentService {
let url = self
.env
.pageserver
.control_plane_api
.clone()
.unwrap()
@@ -90,13 +89,13 @@ impl AttachmentService {
.expect("Failed to construct http client");
let request = AttachHookRequest {
tenant_id,
tenant_id: HexTenantId::new(tenant_id),
pageserver_id: Some(pageserver_id),
};
let response = client.post(url).json(&request).send()?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
return Err(anyhow!("Unexpected status {0}", response.status()));
}
let response = response.json::<AttachHookResponse>()?;

View File

@@ -6,9 +6,9 @@
///
use anyhow::anyhow;
use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::control_api::*;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
@@ -25,22 +25,15 @@ use utils::{
tcp_listener,
};
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
ValidateResponseTenant,
};
use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
/// Host and port to listen on, like `127.0.0.1:1234`
#[arg(short, long)]
listen: std::net::SocketAddr,
listen: String,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: PathBuf,
}
@@ -61,10 +54,13 @@ where
S: serde::Serializer,
V: Clone + Serialize,
{
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
eprintln!("to_hex_map");
let transformed = input
.iter()
.map(|(k, v)| (HexTenantId::new(k.clone()), v.clone()));
transformed
.collect::<HashMap<String, V>>()
.collect::<HashMap<HexTenantId, V>>()
.serialize(serializer)
}
@@ -73,15 +69,10 @@ where
D: serde::de::Deserializer<'de>,
V: Deserialize<'de>,
{
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
hex_map
.into_iter()
.map(|(k, v)| {
TenantId::from_hex(k)
.map(|k| (k, v))
.map_err(serde::de::Error::custom)
})
.collect()
eprintln!("from_hex_map");
let hex_map = HashMap::<HexTenantId, V>::deserialize(deserializer)?;
Ok(hex_map.into_iter().map(|(k, v)| (k.take(), v)).collect())
}
// Top level state available to all HTTP handlers
@@ -111,24 +102,17 @@ impl PersistentState {
async fn load_or_new(path: &Path) -> Self {
match Self::load(path).await {
Ok(s) => {
tracing::info!("Loaded state file at {}", path.display());
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path.display());
Ok(s) => s,
Err(e) => {
tracing::info!(
"Creating new state file at {0} (load returned {e})",
path.to_string_lossy()
);
Self {
tenants: HashMap::new(),
path: path.to_owned(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
}
}
}
}
@@ -169,13 +153,16 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
if state.pageserver == Some(reattach_req.node_id) {
state.generation += 1;
response.tenants.push(ReAttachResponseTenant {
id: *t,
id: HexTenantId::new(t.clone()),
generation: state.generation,
});
}
}
locked.save().await.map_err(ApiError::InternalServerError)?;
locked
.save()
.await
.map_err(|e| ApiError::InternalServerError(e))?;
json_response(StatusCode::OK, response)
}
@@ -185,14 +172,15 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let locked = get_state(&req).inner.read().await;
let state = get_state(&req).inner.clone();
let locked = state.read().await;
let mut response = ValidateResponse {
tenants: Vec::new(),
};
for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
if let Some(tenant_state) = locked.tenants.get(req_tenant.id.as_ref()) {
let valid = tenant_state.generation == req_tenant.gen;
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
@@ -214,7 +202,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
let tenant_state = locked
.tenants
.entry(attach_req.tenant_id)
.entry(attach_req.tenant_id.take())
.or_insert_with(|| TenantState {
pageserver: attach_req.pageserver_id,
generation: 0,
@@ -225,7 +213,10 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
}
let generation = tenant_state.generation;
locked.save().await.map_err(ApiError::InternalServerError)?;
locked
.save()
.await
.map_err(|e| ApiError::InternalServerError(e))?;
json_response(
StatusCode::OK,
@@ -238,9 +229,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
.post("/re-attach", handle_re_attach)
.post("/validate", handle_validate)
.post("/attach_hook", handle_attach_hook)
.post("/re-attach", |r| handle_re_attach(r))
.post("/validate", |r| handle_validate(r))
.post("/attach_hook", |r| handle_attach_hook(r))
}
#[tokio::main]
@@ -259,14 +250,14 @@ async fn main() -> anyhow::Result<()> {
let persistent_state = PersistentState::load_or_new(&args.path).await;
let http_listener = tcp_listener::bind(args.listen)?;
let http_listener = tcp_listener::bind(&args.listen)?;
let router = make_router(persistent_state)
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
tracing::info!("Serving on {0}", args.listen);
tracing::info!("Serving on {0}", args.listen.as_str());
server.await?;
Ok(())

View File

@@ -50,17 +50,16 @@ fn default_conf() -> String {
format!(
r#"
# Default built-in configuration, defined in main.rs
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
[broker]
listen_addr = '{DEFAULT_BROKER_ADDR}'
[[pageservers]]
[pageserver]
id = {DEFAULT_PAGESERVER_ID}
listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
pg_auth_type = '{trust_auth}'
http_auth_type = '{trust_auth}'
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
[[safekeepers]]
id = {DEFAULT_SAFEKEEPER_ID}
@@ -259,7 +258,7 @@ fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_id: &TenantId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(get_default_pageserver(env)
Ok(PageServerNode::from_env(env)
.timeline_list(tenant_id)?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -320,30 +319,17 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
.context("Failed to initialize neon repository")?;
// Initialize pageserver, create initial tenant and timeline.
for ps_conf in &env.pageservers {
PageServerNode::from_env(&env, ps_conf)
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
}
let pageserver = PageServerNode::from_env(&env);
pageserver
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
Ok(env)
}
/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
/// For typical interactive use, one would just run with a single pageserver. Scenarios with
/// tenant/timeline placement across multiple pageservers are managed by python test code rather
/// than this CLI.
fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
let ps_conf = env
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
PageServerNode::from_env(env, ps_conf)
}
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
init_match
.get_many::<String>("pageserver-config-override")
@@ -354,7 +340,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
}
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
let pageserver = get_default_pageserver(env);
let pageserver = PageServerNode::from_env(env);
match tenant_match.subcommand() {
Some(("list", _)) => {
for t in pageserver.tenant_list()? {
@@ -368,13 +354,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
.unwrap_or_default();
// If tenant ID was not specified, generate one
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
let tenant_id = parse_tenant_id(create_match)?.unwrap_or(TenantId::generate());
let generation = if env.control_plane_api.is_some() {
let generation = if env.pageserver.control_plane_api.is_some() {
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
attachment_service.attach_hook(tenant_id, env.pageserver.id)?
} else {
None
};
@@ -439,7 +425,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
}
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let pageserver = get_default_pageserver(env);
let pageserver = PageServerNode::from_env(env);
match timeline_match.subcommand() {
Some(("list", list_match)) => {
@@ -516,7 +502,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
None,
pg_version,
ComputeMode::Primary,
DEFAULT_PAGESERVER_ID,
)?;
println!("Done");
}
@@ -570,6 +555,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
Some(ep_subcommand_data) => ep_subcommand_data,
None => bail!("no endpoint subcommand provided"),
};
let mut cplane = ComputeControlPlane::load(env.clone())?;
// All subcommands take an optional --tenant-id option
@@ -666,13 +652,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.copied()
.unwrap_or(false);
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica,
@@ -688,7 +667,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
http_port,
pg_version,
mode,
pageserver_id,
)?;
}
"start" => {
@@ -698,13 +676,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
// If --safekeepers argument is given, use only the listed safekeeper nodes.
@@ -724,8 +695,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
Some(env.generate_auth_token(&claims)?)
@@ -792,7 +762,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
http_port,
pg_version,
mode,
pageserver_id,
)?;
ep.start(&auth_token, safekeepers, remote_ext_config)?;
}
@@ -817,64 +786,48 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
}
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
Ok(PageServerNode::from_env(
env,
env.get_pageserver_conf(node_id)?,
))
}
let pageserver = PageServerNode::from_env(env);
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args))
{
Some(("start", start_match)) => {
if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("stop", subcommand_args)) => {
let immediate = subcommand_args
Some(("stop", stop_match)) => {
let immediate = stop_match
.get_one::<String>("stop-mode")
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) {
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
}
Some(("restart", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
Some(("restart", restart_match)) => {
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
exit(1);
}
Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
exit(1);
}
}
},
Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
None => bail!("no pageserver subcommand provided"),
@@ -990,7 +943,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
broker::start_broker_process(env)?;
// Only start the attachment service if the pageserver is configured to need it
if env.control_plane_api.is_some() {
if env.pageserver.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start() {
eprintln!("attachment_service start failed: {:#}", e);
@@ -999,13 +952,11 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
}
}
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true);
exit(1);
}
let pageserver = PageServerNode::from_env(env);
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
try_stop_all(env, true);
exit(1);
}
for node in env.safekeepers.iter() {
@@ -1029,6 +980,8 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
}
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
let pageserver = PageServerNode::from_env(env);
// Stop all endpoints
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
@@ -1043,11 +996,8 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
}
}
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver {} stop failed: {:#}", ps_conf.id, e);
}
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
}
for node in env.safekeepers.iter() {
@@ -1061,7 +1011,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
eprintln!("neon broker stop failed: {e:#}");
}
if env.control_plane_api.is_some() {
if env.pageserver.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.stop(immediate) {
eprintln!("attachment service stop failed: {e:#}");
@@ -1081,16 +1031,6 @@ fn cli() -> Command {
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
// --id, when using a pageserver command
let pageserver_id_arg = Arg::new("pageserver-id")
.long("id")
.help("pageserver id")
.required(false);
// --pageserver-id when using a non-pageserver command
let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id")
.long("pageserver-id")
.required(false);
let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
.short('e')
.long("safekeeper-extra-opt")
@@ -1255,16 +1195,10 @@ fn cli() -> Command {
.arg_required_else_help(true)
.about("Manage pageserver")
.subcommand(Command::new("status"))
.arg(pageserver_id_arg.clone())
.subcommand(Command::new("start").about("Start local pageserver")
.arg(pageserver_id_arg.clone())
.arg(pageserver_config_args.clone()))
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(pageserver_id_arg.clone())
.arg(stop_mode_arg.clone()))
.subcommand(Command::new("restart").about("Restart local pageserver")
.arg(pageserver_id_arg.clone())
.arg(pageserver_config_args.clone()))
.subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
)
.subcommand(
Command::new("attachment_service")
@@ -1308,7 +1242,6 @@ fn cli() -> Command {
.arg(lsn_arg.clone())
.arg(pg_port_arg.clone())
.arg(http_port_arg.clone())
.arg(endpoint_pageserver_id_arg.clone())
.arg(
Arg::new("config-only")
.help("Don't do basebackup, create endpoint directory with only config files")
@@ -1326,7 +1259,6 @@ fn cli() -> Command {
.arg(lsn_arg)
.arg(pg_port_arg)
.arg(http_port_arg)
.arg(endpoint_pageserver_id_arg.clone())
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)

View File

@@ -70,7 +70,6 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
pageserver_id: NodeId,
}
//
@@ -83,16 +82,19 @@ pub struct ComputeControlPlane {
pub endpoints: BTreeMap<String, Arc<Endpoint>>,
env: LocalEnv,
pageserver: Arc<PageServerNode>,
}
impl ComputeControlPlane {
// Load current endpoints from the endpoints/ subdirectories
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
let pageserver = Arc::new(PageServerNode::from_env(&env));
let mut endpoints = BTreeMap::default();
for endpoint_dir in std::fs::read_dir(env.endpoints_path())
.with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
{
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
}
@@ -100,6 +102,7 @@ impl ComputeControlPlane {
base_port: 55431,
endpoints,
env,
pageserver,
})
}
@@ -122,29 +125,20 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
pageserver_id: NodeId,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(),
pageserver,
pageserver: Arc::clone(&self.pageserver),
timeline_id,
mode,
tenant_id,
pg_version,
// We don't setup roles and databases in the spec locally, so we don't need to
// do catalog updates. Catalog updates also include check availability
// data creation. Yet, we have tests that check that size and db dump
// before and after start are the same. So, skip catalog updates,
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates: true,
skip_pg_catalog_updates: false,
});
ep.create_endpoint_dir()?;
@@ -158,8 +152,7 @@ impl ComputeControlPlane {
http_port,
pg_port,
pg_version,
skip_pg_catalog_updates: true,
pageserver_id,
skip_pg_catalog_updates: false,
})?,
)?;
std::fs::write(
@@ -194,14 +187,18 @@ pub struct Endpoint {
// These are not part of the endpoint as such, but the environment
// the endpoint runs in.
pub env: LocalEnv,
pageserver: PageServerNode,
pageserver: Arc<PageServerNode>,
// Optimizations
skip_pg_catalog_updates: bool,
}
impl Endpoint {
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
fn from_dir_entry(
entry: std::fs::DirEntry,
env: &LocalEnv,
pageserver: &Arc<PageServerNode>,
) -> Result<Endpoint> {
if !entry.file_type()?.is_dir() {
anyhow::bail!(
"Endpoint::from_dir_entry failed: '{}' is not a directory",
@@ -217,15 +214,12 @@ impl Endpoint {
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
Ok(Endpoint {
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
env: env.clone(),
pageserver,
pageserver: Arc::clone(pageserver),
timeline_id: conf.timeline_id,
mode: conf.mode,
tenant_id: conf.tenant_id,

View File

@@ -68,17 +68,11 @@ pub struct LocalEnv {
pub broker: NeonBroker,
/// This Vec must always contain at least one pageserver
pub pageservers: Vec<PageServerConf>,
pub pageserver: PageServerConf,
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
// Control plane location: if None, we will not run attachment_service. If set, this will
// be propagated into each pageserver's configuration.
#[serde(default)]
pub control_plane_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
#[serde(default)]
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
@@ -124,6 +118,9 @@ pub struct PageServerConf {
// auth type used for the PG and HTTP ports
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
// Control plane location
pub control_plane_api: Option<Url>,
}
impl Default for PageServerConf {
@@ -134,6 +131,7 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
control_plane_api: None,
}
}
}
@@ -182,18 +180,26 @@ impl LocalEnv {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
#[allow(clippy::manual_range_patterns)]
match pg_version {
14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pageserver_bin(&self) -> PathBuf {
@@ -216,23 +222,15 @@ impl LocalEnv {
self.base_data_dir.join("endpoints")
}
pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf {
self.base_data_dir
.join(format!("pageserver_{pageserver_id}"))
// TODO: move pageserver files into ./pageserver
pub fn pageserver_data_dir(&self) -> PathBuf {
self.base_data_dir.clone()
}
pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
self.base_data_dir.join("safekeepers").join(data_dir_name)
}
pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
Ok(conf)
} else {
bail!("could not find pageserver {id}")
}
}
pub fn register_branch_mapping(
&mut self,
branch_name: String,
@@ -309,10 +307,6 @@ impl LocalEnv {
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
if env.pageservers.is_empty() {
anyhow::bail!("Configuration must contain at least one pageserver");
}
env.base_data_dir = base_path();
Ok(env)
@@ -345,7 +339,7 @@ impl LocalEnv {
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .neon/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
let mut conf_content = r#"# This file describes a local deployment of the page server
let mut conf_content = r#"# This file describes a locale deployment of the page server
# and safekeeeper node. It is read by the 'neon_local' command-line
# utility.
"#
@@ -475,9 +469,9 @@ impl LocalEnv {
}
fn auth_keys_needed(&self) -> bool {
self.pageservers.iter().any(|ps| {
ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
}) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
self.pageserver.pg_auth_type == AuthType::NeonJWT
|| self.pageserver.http_auth_type == AuthType::NeonJWT
|| self.safekeepers.iter().any(|sk| sk.auth_enabled)
}
}

View File

@@ -27,7 +27,6 @@ use utils::{
lsn::Lsn,
};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
#[derive(Error, Debug)]
@@ -77,40 +76,43 @@ impl ResponseErrorMessageExt for Response {
#[derive(Debug)]
pub struct PageServerNode {
pub pg_connection_config: PgConnectionConfig,
pub conf: PageServerConf,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
}
impl PageServerNode {
pub fn from_env(env: &LocalEnv, conf: &PageServerConf) -> PageServerNode {
let (host, port) =
parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
pub fn from_env(env: &LocalEnv) -> PageServerNode {
let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
Self {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
}
}
// pageserver conf overrides defined by neon_local configuration.
fn neon_local_overrides(&self) -> Vec<String> {
let id = format!("id={}", self.conf.id);
let id = format!("id={}", self.env.pageserver.id);
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'",
self.env.pg_distrib_dir_raw().display()
);
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
let http_auth_type_param =
format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
let listen_http_addr_param = format!(
"listen_http_addr='{}'",
self.env.pageserver.listen_http_addr
);
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
let listen_pg_addr_param =
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -124,18 +126,17 @@ impl PageServerNode {
broker_endpoint_param,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
if let Some(control_plane_api) = &self.env.pageserver.control_plane_api {
overrides.push(format!(
"control_plane_api='{}'",
control_plane_api.as_str()
));
}
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
if self.env.pageserver.http_auth_type != AuthType::Trust
|| self.env.pageserver.pg_auth_type != AuthType::Trust
{
// Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
}
overrides
}
@@ -143,12 +144,16 @@ impl PageServerNode {
/// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides)
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,))
self.pageserver_init(config_overrides).with_context(|| {
format!(
"Failed to run init for pageserver node {}",
self.env.pageserver.id,
)
})
}
pub fn repo_path(&self) -> PathBuf {
self.env.pageserver_data_dir(self.conf.id)
self.env.pageserver_data_dir()
}
/// The pid file is created by the pageserver process, with its pid stored inside.
@@ -164,7 +169,7 @@ impl PageServerNode {
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
let datadir = self.repo_path();
let node_id = self.conf.id;
let node_id = self.env.pageserver.id;
println!(
"Initializing pageserver node {} at '{}' in {:?}",
node_id,
@@ -173,10 +178,6 @@ impl PageServerNode {
);
io::stdout().flush()?;
if !datadir.exists() {
std::fs::create_dir(&datadir)?;
}
let datadir_path_str = datadir.to_str().with_context(|| {
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
})?;
@@ -207,7 +208,7 @@ impl PageServerNode {
let datadir = self.repo_path();
print!(
"Starting pageserver node {} at '{}' in {:?}",
self.conf.id,
self.env.pageserver.id,
self.pg_connection_config.raw_address(),
datadir
);
@@ -216,7 +217,7 @@ impl PageServerNode {
let datadir_path_str = datadir.to_str().with_context(|| {
format!(
"Cannot start pageserver node {} in path that has no string representation: {:?}",
self.conf.id, datadir,
self.env.pageserver.id, datadir,
)
})?;
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
@@ -260,7 +261,7 @@ impl PageServerNode {
// FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
// needs a token, and how to generate that token, seems independent to whether
// the pageserver requires a token in incoming requests.
Ok(if self.conf.http_auth_type != AuthType::Trust {
Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
// Generate a token to connect from the pageserver to a safekeeper
let token = self
.env
@@ -285,7 +286,7 @@ impl PageServerNode {
pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
let mut config = self.pg_connection_config.clone();
if self.conf.pg_auth_type == AuthType::NeonJWT {
if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -296,7 +297,7 @@ impl PageServerNode {
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
let mut builder = self.http_client.request(method, url);
if self.conf.http_auth_type == AuthType::NeonJWT {
if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;

View File

@@ -30,7 +30,7 @@ cleanup() {
echo "clean up containers if exists"
cleanup
for pg_version in 14 15 16; do
for pg_version in 14 15; do
echo "start containers (pg_version=$pg_version)."
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d

View File

@@ -1,281 +0,0 @@
# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
* Created on: Aug 23, 2023
* Author: Christian Schwarz
## Summary
This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
## Motivation
### Background
We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
If we crash or restart pageserver, we reconstruct the layer map from:
1. local timeline directory contents
2. remote `index_part.json` contents.
The function that is responsible for this is called `Timeline::load_layer_map()`.
The reconciliation process's behavior is the following:
* local-only files will become part of the layer map as local-only layers and rescheduled for upload
* For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map.
### The Problem
There are are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
As stated above, making the update to the layer map in atomic way is trivial.
But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
Currently, we issue the system calls one by one and hope we don't crash.
What happens if we crash and restart in the middle of that system call sequence?
We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
### Problem's Implications For Compaction
The implications of the above are primarily problematic for compaction.
Specifically, the part of it that compacts L0 layers into L1 layers.
Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
It then deletes the L0 layers locally, and schedules an upload of the L1 layers and and updated index part.
If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
This means the compaction after restart will **overwrite** the previously written L1s.
Currently we also schedule an S3 upload of the overwritten L1.
If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
*However*:
1. the file size of the overwritten L1s may not be identical, and
2. the bit pattern of the overwritten L1s may not be identical, and,
3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s.
But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
That earlier `index_part.json`` contained the file size of the pre-overwrite L1.
If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
Effectively, the data in the L1 has become inaccessible to node B.
If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem.
If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
But if (3) ever happens, the logical content may be different, and, we could have truly lost data.
Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
## Design
Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise.
During **timeline load**, the only thing that matters is the remote index part content.
Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
The local timeline dir's `metadata` file does not matter.
The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
Instead, it treats the remote index part as the authoritative layer map.
If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if file size (and in the future, check sum) match what's stated in the index part.
If it doesn't match, we remove the file from the local timeline dir.
After load, **at runtime**, nothing changes compared to what we did before this RFC.
The procedure for single- and multi-object changes is reproduced here for reference:
* For any new layers that the change adds:
* Write them to a temporary location.
* While holding layer map lock:
* Move them to the final location.
* Insert into layer map.
* Make the S3 changes.
We won't reproduce the remote timeline client method calls here because these are subject to change.
Instead we reproduce the sequence of s3 changes that must result for a given single-/multi-object change:
* PUT layer files inserted by the change.
* PUT an index part that has insertions and deletions of the change.
* DELETE the layer files that are deleted by the change.
Note that it is safe for the DELETE to be deferred arbitrarily.
* If it never happens, we leak the object, but, that's not a correctness concern.
* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`.
* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details).
## How This Solves The Problem
If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part.
The S3 change sequence above is obviously crash-consistent.
If we crash before the index part PUT, then we leak the inserted layer files to S3.
If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
Leaking is fine, it's a pre-existing condition and not addressed in this RFC.
Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent:
* atomic layer map update at runtime, currently by using an RwLock in write mode
* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
* local timeline dir state:
* irrelevant for layer map content => irrelevant for atomic updates / crash consistency
* if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them
* if we crash before index part PUT, local layer files will be deleted
## Trade-Offs
### Fundamental
If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
* compaction: we lose the entire compaction iteration work; need to re-do it again
* gc: no change to what we have today
If the work is still deemed necessary after restart, the restarted restarted pageserver will re-do this work.
The amount of work to be re-do is capped to the lag of S3 changes to the local changes.
Assuming upload queue allows for unlimited queue depth (that's what it does today), this means:
* on-demand downloads that were needed to do the work: are likely still present, not lost
* wal ingest: currently unbounded
* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
* Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
* In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`.
* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
* I have no intuition how expensive / long-running it is in reality.
* gc: `update_gc_info`` work (not substantial, AFAIK)
To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
However, to limit the amount of lost CPU work, we would need a way to make make the compaction/image-layer-generation algorithms interruptible & resumable.
We aren't there yet, the need for it is tracked by ([#4580](https://github.com/neondatabase/neon/issues/4580)).
However, this RFC is not constraining the design space either.
### Practical
#### Pageserver Restarts
Pageserver crashes are very rare ; it would likely be acceptable to re-do the lost work in that case.
However, regular pageserver restart happen frequently, e.g., during weekly deploys.
In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down.
We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
A longer budget would expose tenants that are done early to a longer downtime.
A short budget would risk throwing away more work that'd have to be re-done after restart.
In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
We can mitigate this problem as follows:
0. initially, by accepting that we need to do the work again
1. short-term, introducing measures to cap the amount of in-flight work:
- cap upload queue length, use backpressure to slow down compaction
- disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver`
- introducing a read-only shutdown state for tenants that are fast to shut down;
that state would be equivalent to the state of a tenant in hot standby / readonly mode.
2. mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
#### `disk_consistent_lsn` can go backwards
`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
Compute certainly doesn't care about `disk_consistent_lsn`.
## Side-Effects Of This Design
* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` requests for a tenant's timelines during tenant load.
## Limitations
Multi-object changes that span multiple timelines aren't covered by this RFC.
That's fine because we currently don't need them, as evidenced by the absence
of a Pageserver operation that holds multiple timelines' layer map lock at a time.
## Impacted components
Primarily pageservers.
Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
No changes to safekeepers are needed.
## Alternatives considered
### Alternative 1: WAL
We could have a local WAL for timeline dir changes, as proposed here https://github.com/neondatabase/neon/issues/4418 and partially implemented here https://github.com/neondatabase/neon/pull/4422 .
The WAL would be used to
1. make multi-object changes atomic
2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
The WAL is appealing in a local-first world, but, it's much more complex than the design described above:
* New on-disk state to get right.
* Forward- and backward-compatibility development costs in the future.
### Alternative 2: Flow Everything Through `index_part.json`
We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
I.e., layer map would always be the last-persisted S3 state.
That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
And it might make hot standbys / read-only pageservers less of a special case in the future.
But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
### Alternative 3: Sequence Numbers For Layers
Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
```
# before
tenants/$tenant/timelines/$timeline/$key_and_lsn_range
# after
tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
```
To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`.
This alternative does not solve atomic layer map updates.
In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
In fact, this alternative makes it worse because the data is now duplicated in the not-overwritten and overwritten L1 layer files.
We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents.
However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute.
The proposed design in this RFC addresses both.
So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
That way, we avoid a phase where the crash-during-compaction problem is accute.
## Related issues
- https://github.com/neondatabase/neon/issues/4749
- https://github.com/neondatabase/neon/issues/4418
- https://github.com/neondatabase/neon/pull/4422
- https://github.com/neondatabase/neon/issues/5077
- https://github.com/neondatabase/neon/issues/4088
- (re)resolutions:
- https://github.com/neondatabase/neon/pull/4696
- https://github.com/neondatabase/neon/pull/4094
- https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that assumption in order to fix the problem.
## Implementation Plan
1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
- The nasty part here is to fix all the tests that fiddle with the local timeline directory.
Possibly they are just irrelevant with this change, but, each case will require inspection.
2. Implement the design above.
- Initially, ship without the mitigations for restart and accept we will do some work twice.
- Measure the impact and implement one of the mitigations.

View File

@@ -89,8 +89,6 @@ impl RemoteExtSpec {
&self,
ext_name: &str,
is_library: bool,
build_tag: &str,
pg_major_version: &str,
) -> anyhow::Result<(String, RemotePath)> {
let mut real_ext_name = ext_name;
if is_library {
@@ -106,32 +104,11 @@ impl RemoteExtSpec {
.ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
}
// Check if extension is present in public or custom.
// If not, then it is not allowed to be used by this compute.
if let Some(public_extensions) = &self.public_extensions {
if !public_extensions.contains(&real_ext_name.to_string()) {
if let Some(custom_extensions) = &self.custom_extensions {
if !custom_extensions.contains(&real_ext_name.to_string()) {
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
}
}
}
}
match self.extension_data.get(real_ext_name) {
Some(_ext_data) => {
// Construct the path to the extension archive
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
//
// Keep it in sync with path generation in
// https://github.com/neondatabase/build-custom-extensions/tree/main
let archive_path_str =
format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
Ok((
real_ext_name.to_string(),
RemotePath::from_string(&archive_path_str)?,
))
}
Some(ext_data) => Ok((
real_ext_name.to_string(),
RemotePath::from_string(&ext_data.archive_path)?,
)),
None => Err(anyhow::anyhow!(
"real_ext_name {} is not found",
real_ext_name

View File

@@ -12,6 +12,7 @@ const_format.workspace = true
anyhow.workspace = true
bytes.workspace = true
byteorder.workspace = true
hex.workspace = true
utils.workspace = true
postgres_ffi.workspace = true
enum-map.workspace = true

View File

@@ -1,22 +1,63 @@
//! Types in this file are for pageserver's upward-facing API calls to the control plane,
//! required for acquiring and validating tenant generation numbers.
//!
//! See docs/rfcs/025-generation-numbers.md
/// Types in this file are for pageserver's upward-facing API calls to the control plane
use hex::FromHex;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId};
/// TenantId's serialization is an array of u8, which is rather unfriendly
/// for outside callers who aren't working with the native Rust TenantId.
/// This class wraps it in serialization that is just the hex strict
/// representation.
#[derive(Eq, PartialEq, Clone, Hash)]
pub struct HexTenantId(TenantId);
impl HexTenantId {
pub fn new(t: TenantId) -> Self {
Self(t)
}
pub fn take(self) -> TenantId {
self.0
}
}
impl AsRef<TenantId> for HexTenantId {
fn as_ref(&self) -> &TenantId {
&self.0
}
}
impl Serialize for HexTenantId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let hex = self.0.hex_encode();
serializer.collect_str(&hex)
}
}
impl<'de> Deserialize<'de> for HexTenantId {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let string = String::deserialize(deserializer)?;
TenantId::from_hex(string)
.map(|t| HexTenantId::new(t))
.map_err(|e| serde::de::Error::custom(format!("{e}")))
}
}
// Top level s
#[derive(Serialize, Deserialize)]
pub struct ReAttachRequest {
pub node_id: NodeId,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub id: HexTenantId,
pub generation: u32,
}
@@ -25,11 +66,9 @@ pub struct ReAttachResponse {
pub tenants: Vec<ReAttachResponseTenant>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ValidateRequestTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub id: HexTenantId,
pub gen: u32,
}
@@ -43,10 +82,8 @@ pub struct ValidateResponse {
pub tenants: Vec<ValidateResponseTenant>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ValidateResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub id: HexTenantId,
pub valid: bool,
}

View File

@@ -201,15 +201,6 @@ pub struct TenantCreateRequest {
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[serde_as]
#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLoadRequest {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
}
impl std::ops::Deref for TenantCreateRequest {
type Target = TenantConfig;
@@ -381,8 +372,6 @@ pub struct TimelineInfo {
pub pg_version: u32,
pub state: TimelineState,
pub walreceiver_status: String,
}
#[derive(Debug, Clone, Serialize)]

View File

@@ -10,11 +10,9 @@ should be auto-generated too, but that's a TODO.
The PostgreSQL on-disk file format is not portable across different
CPU architectures and operating systems. It is also subject to change
in each major PostgreSQL version. Currently, this module supports
PostgreSQL v14, v15 and v16: bindings and code that depends on them are
version-specific.
This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15` and
`postgres_ffi::v16`. Version independent code is explicitly exported into
shared `postgres_ffi`.
PostgreSQL v14 and v15: bindings and code that depends on them are version-specific.
This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15`
Version independend code is explicitly exported into shared `postgres_ffi`.
TODO: Currently, there is also some code that deals with WAL records

View File

@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
PathBuf::from("pg_install")
};
for pg_version in &["v14", "v15", "v16"] {
for pg_version in &["v14", "v15"] {
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
if pg_install_dir_versioned.is_relative() {
let cwd = env::current_dir().context("Failed to get current_dir")?;
@@ -125,7 +125,6 @@ fn main() -> anyhow::Result<()> {
.allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
.allowlist_type("PageHeaderData")
.allowlist_type("DBState")
.allowlist_type("RelMapFile")
// Because structs are used for serialization, tell bindgen to emit
// explicit padding fields.
.explicit_padding(true)

View File

@@ -51,59 +51,11 @@ macro_rules! for_all_postgres_versions {
($macro:tt) => {
$macro!(v14);
$macro!(v15);
$macro!(v16);
};
}
for_all_postgres_versions! { postgres_ffi }
/// dispatch_pgversion
///
/// Run a code block in a context where the postgres_ffi bindings for a
/// specific (supported) PostgreSQL version are `use`-ed in scope under the pgv
/// identifier.
/// If the provided pg_version is not supported, we panic!(), unless the
/// optional third argument was provided (in which case that code will provide
/// the default handling instead).
///
/// Use like
///
/// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE })
/// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE)
///
/// Other uses are for macro-internal purposes only and strictly unsupported.
///
#[macro_export]
macro_rules! dispatch_pgversion {
($version:expr, $code:expr) => {
dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version))
};
($version:expr, $code:expr, $invalid_pgver_handling:expr) => {
dispatch_pgversion!(
$version => $code,
default = $invalid_pgver_handling,
pgversions = [
14 : v14,
15 : v15,
16 : v16,
]
)
};
($pgversion:expr => $code:expr,
default = $default:expr,
pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => {
match ($pgversion) {
$($sv => {
use $crate::$vsv as pgv;
$code
},)+
_ => {
$default
}
}
};
}
pub mod pg_constants;
pub mod relfile_utils;
@@ -138,7 +90,13 @@ pub use v14::xlog_utils::XLogFileName;
pub use v14::bindings::DBState_DB_SHUTDOWNED;
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
match version {
14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
_ => anyhow::bail!("Unknown version {}", version),
}
}
pub fn generate_wal_segment(
@@ -149,11 +107,11 @@ pub fn generate_wal_segment(
) -> Result<Bytes, SerializeError> {
assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));
dispatch_pgversion!(
pg_version,
pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn),
Err(SerializeError::BadInput)
)
match pg_version {
14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
_ => Err(SerializeError::BadInput),
}
}
pub fn generate_pg_control(
@@ -162,11 +120,11 @@ pub fn generate_pg_control(
lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<(Bytes, u64)> {
dispatch_pgversion!(
pg_version,
pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
anyhow::bail!("Unknown version {}", pg_version)
)
match pg_version {
14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
_ => anyhow::bail!("Unknown version {}", pg_version),
}
}
// PG timeline is always 1, changing it doesn't have any useful meaning in Neon.
@@ -238,6 +196,8 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
}
pub mod waldecoder {
use crate::{v14, v15};
use bytes::{Buf, Bytes, BytesMut};
use std::num::NonZeroU32;
use thiserror::Error;
@@ -288,17 +248,22 @@ pub mod waldecoder {
}
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
dispatch_pgversion!(
self.pg_version,
{
use pgv::waldecoder_handler::WalStreamDecoderHandler;
match self.pg_version {
// This is a trick to support both versions simultaneously.
// See WalStreamDecoderHandler comments.
14 => {
use self::v14::waldecoder_handler::WalStreamDecoderHandler;
self.poll_decode_internal()
},
Err(WalDecodeError {
}
15 => {
use self::v15::waldecoder_handler::WalStreamDecoderHandler;
self.poll_decode_internal()
}
_ => Err(WalDecodeError {
msg: format!("Unknown version {}", self.pg_version),
lsn: self.lsn,
})
)
}),
}
}
}
}

View File

@@ -163,20 +163,6 @@ pub const RM_HEAP2_ID: u8 = 9;
pub const RM_HEAP_ID: u8 = 10;
pub const RM_LOGICALMSG_ID: u8 = 21;
// from neon_rmgr.h
pub const RM_NEON_ID: u8 = 134;
pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80;
pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00;
pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10;
pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20;
pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30;
pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40;
pub const XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50;
pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40;
// from xlogreader.h
pub const XLR_INFO_MASK: u8 = 0x0F;
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;

View File

@@ -3,8 +3,3 @@ pub const XLOG_DBASE_DROP: u8 = 0x10;
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
(bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
}

View File

@@ -1,18 +1,10 @@
pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00;
pub const XLOG_DBASE_DROP: u8 = 0x20;
pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
(bimg_info & ANY_COMPRESS_FLAG) != 0
}

View File

@@ -1,18 +0,0 @@
pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
pub const XLOG_DBASE_DROP: u8 = 0x20;
pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
(bimg_info & ANY_COMPRESS_FLAG) != 0
}

View File

@@ -49,9 +49,9 @@ impl Conf {
pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
#[allow(clippy::manual_range_patterns)]
match self.pg_version {
14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
14 => Ok(path.join(format!("v{}", self.pg_version))),
15 => Ok(path.join(format!("v{}", self.pg_version))),
_ => bail!("Unsupported postgres version: {}", self.pg_version),
}
}
@@ -250,18 +250,11 @@ fn craft_internal<C: postgres::GenericClient>(
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
let last_lsn = match last_lsn {
None => client.pg_current_wal_insert_lsn()?,
Some(last_lsn) => {
let insert_lsn = client.pg_current_wal_insert_lsn()?;
match last_lsn.cmp(&insert_lsn) {
Ordering::Less => bail!(
"Some records were inserted after the crafted WAL: {} vs {}",
last_lsn,
insert_lsn
),
Ordering::Equal => last_lsn,
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
}
}
Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
Ordering::Equal => last_lsn,
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
},
};
if !intermediate_lsns.starts_with(&[initial_lsn]) {
intermediate_lsns.insert(0, initial_lsn);
@@ -370,9 +363,8 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
);
ensure!(
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
after_xlog_switch,
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
"XLOG_SWITCH message ended not on page boundary: {}",
after_xlog_switch
);
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
}

View File

@@ -959,7 +959,7 @@ mod tests {
let make_params = |options| StartupMessageParams::new([("options", options)]);
let params = StartupMessageParams::new([]);
assert!(params.options_escaped().is_none());
assert!(matches!(params.options_escaped(), None));
let params = make_params("");
assert!(split_options(&params).is_empty());

View File

@@ -13,13 +13,14 @@ use std::{
collections::HashMap,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
path::{Path, PathBuf},
path::{Path, PathBuf, StripPrefixError},
pin::Pin,
sync::Arc,
};
use anyhow::{bail, Context};
use serde::{Deserialize, Serialize};
use tokio::io;
use toml_edit::Item;
use tracing::info;
@@ -44,12 +45,34 @@ pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
// From the S3 spec
pub const MAX_KEYS_PER_DELETE: usize = 1000;
/// Path on the remote storage, relative to some inner prefix.
/// The prefix is an implementation detail, that allows representing local paths
/// as the remote ones, stripping the local storage prefix away.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);
impl Serialize for RemotePath {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl<'de> Deserialize<'de> for RemotePath {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let str = String::deserialize(deserializer)?;
Ok(Self(PathBuf::from(&str)))
}
}
impl std::fmt::Display for RemotePath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.display())
@@ -88,6 +111,15 @@ impl RemotePath {
pub fn extension(&self) -> Option<&str> {
self.0.extension()?.to_str()
}
/// Unwrap the PathBuf that RemotePath wraps
pub fn take(self) -> PathBuf {
self.0
}
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, StripPrefixError> {
self.0.strip_prefix(&p.0)
}
}
/// Storage (potentially remote) API to manage its state.
@@ -166,6 +198,8 @@ pub enum DownloadError {
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
NotFound,
/// The client was shut down
Shutdown,
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
@@ -177,6 +211,7 @@ impl std::fmt::Display for DownloadError {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Shutdown => write!(f, "Client shutting down"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
}
}
@@ -241,6 +276,18 @@ impl GenericRemoteStorage {
}
}
/// For small, simple downloads where caller doesn't want to handle the streaming: return the full body
pub async fn download_all(&self, from: &RemotePath) -> Result<Vec<u8>, DownloadError> {
let mut download = self.download(from).await?;
let mut bytes = Vec::new();
tokio::io::copy(&mut download.download_stream, &mut bytes)
.await
.with_context(|| format!("Failed to download body from {from}"))
.map_err(DownloadError::Other)?;
Ok(bytes)
}
pub async fn download_byte_range(
&self,
from: &RemotePath,

View File

@@ -155,20 +155,18 @@ impl RemoteStorage for LocalFs {
// the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone();
match fs::metadata(full_path.clone()).await {
Err(e) => {
// It's not a file that exists: strip the prefix back to the parent directory
if matches!(e.kind(), ErrorKind::NotFound) {
initial_dir.pop();
}
}
Ok(meta) => {
if !meta.is_dir() {
// It's not a directory: strip back to the parent
initial_dir.pop();
}
}
Err(e) if e.kind() == ErrorKind::NotFound => {
// It's not a file that exists: strip the prefix back to the parent directory
initial_dir.pop();
}
Err(e) => {
// Unexpected I/O error
anyhow::bail!(e)
}
}
// Note that PathBuf starts_with only considers full path segments, but

View File

@@ -22,7 +22,7 @@ use aws_sdk_s3::{
Client,
};
use aws_smithy_http::body::SdkBody;
use hyper::Body;
use hyper::{Body, StatusCode};
use scopeguard::ScopeGuard;
use tokio::{
io::{self, AsyncRead},
@@ -529,7 +529,16 @@ impl RemoteStorage for S3Bucket {
}
}
Err(e) => {
return Err(e.into());
if let Some(r) = e.raw_response() {
if r.http().status() == StatusCode::NOT_FOUND {
// 404 is acceptable for deletions. AWS S3 does not return this, but
// some other implementations might (e.g. GCS XML API returns 404 on DeleteObject
// to a missing key)
continue;
} else {
return Err(anyhow::format_err!("DeleteObjects response error: {e}"));
}
}
}
}
}
@@ -573,7 +582,7 @@ mod tests {
#[test]
fn relative_path() {
let all_paths = ["", "some/path", "some/path/"];
let all_paths = vec!["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))

View File

@@ -9,12 +9,11 @@ PORT=$4
SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
rm -fr "$DATA_DIR"
env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
declare -i WAL_SIZE=$REDO_POS+114
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done

View File

@@ -1,13 +1,8 @@
use std::fmt::Debug;
use std::fmt::Display;
use serde::{Deserialize, Serialize};
/// Tenant generations are used to provide split-brain safety and allow
/// multiple pageservers to attach the same tenant concurrently.
///
/// See docs/rfcs/025-generation-numbers.md for detail on how generation
/// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub enum Generation {
// Generations with this magic value will not add a suffix to S3 keys, and will not
// be included in persisted index_part.json. This value is only to be used
@@ -53,7 +48,6 @@ impl Generation {
matches!(self, Self::None)
}
#[track_caller]
pub fn get_suffix(&self) -> String {
match self {
Self::Valid(v) => {
@@ -66,27 +60,19 @@ impl Generation {
}
}
/// `suffix` is the part after "-" in a key
///
/// Returns None if parsing was unsuccessful
pub fn parse_suffix(suffix: &str) -> Option<Generation> {
u32::from_str_radix(suffix, 16).map(Generation::new).ok()
pub fn previous(&self) -> Self {
if let Self::Valid(v) = self {
Self::new(v - 1)
} else {
Self::none()
}
}
#[track_caller]
pub fn previous(&self) -> Generation {
match self {
Self::Valid(n) => {
if *n == 0 {
// Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation
// to 0 as being "no generation".
Self::None
} else {
Self::Valid(n - 1)
}
}
Self::None => Self::None,
Self::Broken => panic!("Attempted to use a broken generation"),
pub fn into(self) -> Option<u32> {
if let Self::Valid(v) = self {
Some(v)
} else {
None
}
}
}
@@ -103,7 +89,7 @@ impl Serialize for Generation {
// that include an optional generation should convert None to an
// Option<Generation>::None
Err(serde::ser::Error::custom(
"Tried to serialize invalid generation ({self})",
"Tried to serialize invalid generation",
))
}
}
@@ -118,10 +104,7 @@ impl<'de> Deserialize<'de> for Generation {
}
}
// We intentionally do not implement Display for Generation, to reduce the
// risk of a bug where the generation is used in a format!() string directly
// instead of using get_suffix().
impl Debug for Generation {
impl Display for Generation {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Valid(v) => {

View File

@@ -24,6 +24,9 @@ pub enum ApiError {
#[error("Precondition failed: {0}")]
PreconditionFailed(Box<str>),
#[error("Shutting down")]
ShuttingDown,
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -52,6 +55,10 @@ impl ApiError {
self.to_string(),
StatusCode::PRECONDITION_FAILED,
),
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
"Shutting down".to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -50,7 +50,7 @@ impl Id {
Id::from(tli_buf)
}
fn hex_encode(&self) -> String {
pub fn hex_encode(&self) -> String {
static HEX: &[u8] = b"0123456789abcdef";
let mut buf = vec![0u8; self.0.len() * 2];
@@ -133,6 +133,10 @@ macro_rules! id_newtype {
pub const fn from_array(b: [u8; 16]) -> Self {
$t(Id(b))
}
pub fn hex_encode(&self) -> String {
self.0.hex_encode()
}
}
impl FromStr for $t {
@@ -244,13 +248,13 @@ id_newtype!(TenantId);
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ConnectionId(Id);
id_newtype!(ConnectionId);
// A pair uniquely identifying Neon instance.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct TenantTimelineId {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -273,6 +277,36 @@ impl TenantTimelineId {
}
}
impl Serialize for TenantTimelineId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl<'de> Deserialize<'de> for TenantTimelineId {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let str = String::deserialize(deserializer)?;
if let Some((tenant_part, timeline_part)) = str.split_once('/') {
Ok(Self {
tenant_id: TenantId(Id::from_hex(tenant_part).map_err(|e| {
serde::de::Error::custom(format!("Malformed tenant in TenantTimelineId: {e}"))
})?),
timeline_id: TimelineId(Id::from_hex(timeline_part).map_err(|e| {
serde::de::Error::custom(format!("Malformed timeline in TenantTimelineId {e}"))
})?),
})
} else {
Err(serde::de::Error::custom("Malformed TenantTimelineId"))
}
}
}
impl fmt::Display for TenantTimelineId {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}/{}", self.tenant_id, self.timeline_id)

View File

@@ -178,17 +178,14 @@ pub async fn ws_handler(
/// Starts the monitor. If startup fails or the monitor exits, an error will
/// be logged and our internal state will be reset to allow for new connections.
#[tracing::instrument(skip_all)]
#[tracing::instrument(skip_all, fields(?args))]
async fn start_monitor(
ws: WebSocket,
args: &Args,
kill: broadcast::Receiver<()>,
token: CancellationToken,
) {
info!(
?args,
"accepted new websocket connection -> starting monitor"
);
info!("accepted new websocket connection -> starting monitor");
let timeout = Duration::from_secs(4);
let monitor = tokio::time::timeout(
timeout,

View File

@@ -5,7 +5,6 @@
//! all functionality.
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::{fmt::Debug, mem};
use anyhow::{bail, Context};
@@ -37,8 +36,6 @@ pub struct Runner {
/// by us vs the autoscaler-agent.
counter: usize,
last_upscale_request_at: Option<Instant>,
/// A signal to kill the main thread produced by `self.run()`. This is triggered
/// when the server receives a new connection. When the thread receives the
/// signal off this channel, it will gracefully shutdown.
@@ -102,7 +99,6 @@ impl Runner {
cgroup: None,
dispatcher,
counter: 1, // NB: must be odd, see the comment about the field for more.
last_upscale_request_at: None,
kill,
};
@@ -401,20 +397,6 @@ impl Runner {
if request.is_none() {
bail!("failed to listen for upscale event from cgroup")
}
// If it's been less than 1 second since the last time we requested upscaling,
// ignore the event, to avoid spamming the agent (otherwise, this can happen
// ~1k times per second).
if let Some(t) = self.last_upscale_request_at {
let elapsed = t.elapsed();
if elapsed < Duration::from_secs(1) {
info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
continue;
}
}
self.last_upscale_request_at = Some(Instant::now());
info!("cgroup asking for upscale; forwarding request");
self.counter += 2; // Increment, preserving parity (i.e. keep the
// counter odd). See the field comment for more.

View File

@@ -3,7 +3,6 @@
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
use anyhow::Result;
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;
@@ -97,7 +96,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
@@ -143,12 +142,12 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let mut total_delta_layers = 0usize;
let mut total_image_layers = 0usize;
let mut total_excess_layers = 0usize;
for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? {
for tenant in fs::read_dir(storage_path.join("tenants"))? {
let tenant = tenant?;
if !tenant.file_type()?.is_dir() {
continue;
}
for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
let timeline = timeline?;
if !timeline.file_type()?.is_dir() {
continue;

View File

@@ -5,7 +5,6 @@ use clap::Subcommand;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use pageserver::{page_cache, virtual_file};
use pageserver::{
repository::{Key, KEY_SIZE},
@@ -48,7 +47,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
let path = path.as_ref();
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
@@ -69,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
},
)
.await?;
let cursor = BlockCursor::new_fileblockreader(&file);
let cursor = BlockCursor::new_fileblockreader_virtual(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos()).await?;
println!("key:{} value_len:{}", k, value.len());
@@ -81,13 +80,13 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
match cmd {
LayerCmd::List { path } => {
for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
for tenant in fs::read_dir(path.join("tenants"))? {
let tenant = tenant?;
if !tenant.file_type()?.is_dir() {
continue;
}
println!("tenant {}", tenant.file_name().to_string_lossy());
for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
let timeline = timeline?;
if !timeline.file_type()?.is_dir() {
continue;
@@ -102,9 +101,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
timeline,
} => {
let timeline_path = path
.join(TENANTS_SEGMENT_NAME)
.join("tenants")
.join(tenant)
.join(TIMELINES_SEGMENT_NAME)
.join("timelines")
.join(timeline);
let mut idx = 0;
for layer in fs::read_dir(timeline_path)? {

View File

@@ -25,7 +25,6 @@ use crate::context::RequestContext;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::dispatch_pgversion;
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
@@ -324,25 +323,14 @@ where
.timeline
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
.await?;
ensure!(
img.len()
== dispatch_pgversion!(
self.timeline.pg_version,
pgv::bindings::SIZEOF_RELMAPFILE
)
);
ensure!(img.len() == 512);
Some(img)
} else {
None
};
if spcnode == GLOBALTABLESPACE_OID {
let pg_version_str = match self.timeline.pg_version {
14 | 15 => self.timeline.pg_version.to_string(),
ver => format!("{ver}\x0A"),
};
let pg_version_str = self.timeline.pg_version.to_string();
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
@@ -386,10 +374,7 @@ where
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let pg_version_str = match self.timeline.pg_version {
14 | 15 => self.timeline.pg_version.to_string(),
ver => format!("{ver}\x0A"),
};
let pg_version_str = self.timeline.pg_version.to_string();
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;

View File

@@ -2,12 +2,14 @@
use std::env::{var, VarError};
use std::sync::Arc;
use std::time::Duration;
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::deletion_queue::{DeletionQueue, DeletionQueueError};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -349,6 +351,35 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Set up deletion queue
let deletion_queue_cancel = tokio_util::sync::CancellationToken::new();
let (deletion_queue, deletion_frontend, deletion_backend, deletion_executor) =
DeletionQueue::new(remote_storage.clone(), conf, deletion_queue_cancel.clone());
if let Some(mut deletion_frontend) = deletion_frontend {
BACKGROUND_RUNTIME.spawn(async move {
deletion_frontend
.background()
.instrument(info_span!(parent:None, "deletion frontend"))
.await
});
}
if let Some(mut deletion_backend) = deletion_backend {
BACKGROUND_RUNTIME.spawn(async move {
deletion_backend
.background()
.instrument(info_span!(parent: None, "deletion backend"))
.await
});
}
if let Some(mut deletion_executor) = deletion_executor {
BACKGROUND_RUNTIME.spawn(async move {
deletion_executor
.background()
.instrument(info_span!(parent: None, "deletion executor"))
.await
});
}
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
@@ -386,9 +417,9 @@ fn start_pageserver(
TenantSharedResources {
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
deletion_queue_client: deletion_queue.new_client(),
},
order,
shutdown_pageserver.clone(),
))?;
BACKGROUND_RUNTIME.spawn({
@@ -477,19 +508,17 @@ fn start_pageserver(
{
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
let router_state = Arc::new(
http::routes::State::new(
conf,
http_auth.clone(),
remote_storage,
broker_client.clone(),
disk_usage_eviction_state,
)
.context("Failed to initialize router state")?,
);
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
.build()
.map_err(|err| anyhow!(err))?;
let router = http::make_router(
conf,
launch_ts,
http_auth,
broker_client.clone(),
remote_storage,
deletion_queue.clone(),
disk_usage_eviction_state,
)?
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?
.serve(service)
@@ -608,6 +637,36 @@ fn start_pageserver(
// The plan is to change that over time.
shutdown_pageserver.take();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
// Best effort to persist any outstanding deletions, to avoid leaking objects
let dq = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(async move {
match tokio::time::timeout(Duration::from_secs(5), dq.new_client().flush()).await {
Ok(flush_r) => {
match flush_r {
Ok(()) => {
info!("Deletion queue flushed successfully on shutdown")
}
Err(e) => {
match e {
DeletionQueueError::ShuttingDown => {
// This is not harmful for correctness, but is unexpected: the deletion
// queue's workers should stay alive as long as there are any client handles instantiated.
warn!("Deletion queue stopped prematurely");
}
}
}
}
}
Err(e) => {
warn!("Timed out flushing deletion queue on shutdown ({e})")
}
}
});
// Clean shutdown of deletion queue workers
deletion_queue_cancel.cancel();
unreachable!()
}
})

View File

@@ -32,8 +32,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
TIMELINES_SEGMENT_NAME,
TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
@@ -73,7 +72,7 @@ pub mod defaults {
/// Default built-in configuration file.
///
pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
r#"
r###"
# Initial configuration file created by 'pageserver --init'
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
@@ -118,7 +117,7 @@ pub mod defaults {
[remote_storage]
"#
"###
);
}
@@ -577,7 +576,28 @@ impl PageServerConf {
//
pub fn tenants_path(&self) -> PathBuf {
self.workdir.join(TENANTS_SEGMENT_NAME)
self.workdir.join("tenants")
}
pub fn deletion_prefix(&self) -> PathBuf {
self.workdir.join("deletion")
}
pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix()
.join(format!("{sequence:016x}-{VERSION:02x}.list"))
}
pub fn deletion_header_path(&self) -> PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
}
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
@@ -668,18 +688,26 @@ impl PageServerConf {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
#[allow(clippy::manual_range_patterns)]
match pg_version {
14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
/// Parse a configuration file (pageserver.toml) into a PageServerConf struct,

View File

@@ -1,119 +0,0 @@
use std::collections::HashMap;
use hyper::StatusCode;
use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{
backoff,
generation::Generation,
id::{NodeId, TenantId},
};
use crate::config::PageServerConf;
// Backoffs when control plane requests do not succeed: compromise between reducing load
// on control plane, and retrying frequently when we are blocked on a control plane
// response to make progress.
const BACKOFF_INCREMENT: f64 = 0.1;
const BACKOFF_MAX: f64 = 10.0;
/// The Pageserver's client for using the control plane API: this is a small subset
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
pub(crate) struct ControlPlaneClient {
http_client: reqwest::Client,
base_url: Url,
node_id: NodeId,
cancel: CancellationToken,
}
impl ControlPlaneClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
};
if let Ok(mut segs) = url.path_segments_mut() {
// This ensures that `url` ends with a slash if it doesn't already.
// That way, we can subsequently use join() to safely attach extra path elements.
segs.pop_if_empty().push("");
}
let client = reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
Some(Self {
http_client: client,
base_url: url,
node_id: conf.id,
cancel: cancel.clone(),
})
}
async fn try_re_attach(
&self,
url: Url,
request: &ReAttachRequest,
) -> anyhow::Result<ReAttachResponse> {
match self.http_client.post(url).json(request).send().await {
Err(e) => Err(anyhow::Error::from(e)),
Ok(r) => {
if r.status() == StatusCode::OK {
r.json::<ReAttachResponse>()
.await
.map_err(anyhow::Error::from)
} else {
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
}
}
}
}
/// Block until we get a successful response
pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
let re_attach_path = self
.base_url
.join("re-attach")
.expect("Failed to build re-attach path");
let request = ReAttachRequest {
node_id: self.node_id,
};
let mut attempt = 0;
loop {
let result = self.try_re_attach(re_attach_path.clone(), &request).await;
match result {
Ok(res) => {
tracing::info!(
"Received re-attach response with {} tenants",
res.tenants.len()
);
return Ok(res
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>());
}
Err(e) => {
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
backoff::exponential_backoff(
attempt,
BACKOFF_INCREMENT,
BACKOFF_MAX,
&self.cancel,
)
.await;
if self.cancel.is_cancelled() {
return Err(anyhow::anyhow!("Shutting down"));
}
attempt += 1;
}
}
}
}
}

View File

@@ -0,0 +1,850 @@
mod backend;
mod executor;
mod frontend;
use std::collections::HashMap;
use std::path::PathBuf;
use crate::metrics::DELETION_QUEUE_SUBMITTED;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
use serde_with::serde_as;
use thiserror::Error;
use tokio;
use tokio_util::sync::CancellationToken;
use tracing::{self, debug, error};
use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
pub(crate) use self::backend::BackendQueueWorker;
use self::executor::ExecutorWorker;
use self::frontend::DeletionOp;
pub(crate) use self::frontend::FrontendQueueWorker;
use backend::BackendQueueMessage;
use executor::ExecutorMessage;
use frontend::FrontendQueueMessage;
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
// TODO: adminstrative "panic button" config property to disable all deletions
// TODO: configurable for how long to wait before executing deletions
/// We aggregate object deletions from many tenants in one place, for several reasons:
/// - Coalesce deletions into fewer DeleteObjects calls
/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes
/// to flush any outstanding deletions.
/// - Globally control throughput of deletions, as these are a low priority task: do
/// not compete with the same S3 clients/connections used for higher priority uploads.
/// - Future: enable validating that we may do deletions in a multi-attached scenario,
/// via generation numbers (see https://github.com/neondatabase/neon/pull/4919)
///
/// There are two kinds of deletion: deferred and immediate. A deferred deletion
/// may be intentionally delayed to protect passive readers of S3 data, and may
/// be subject to a generation number validation step. An immediate deletion is
/// ready to execute immediately, and is only queued up so that it can be coalesced
/// with other deletions in flight.
///
/// Deferred deletions pass through three steps:
/// - Frontend: accumulate deletion requests from Timelines, and batch them up into
/// DeletionLists, which are persisted to S3.
/// - Backend: accumulate deletion lists, and validate them en-masse prior to passing
/// the keys in the list onward for actual deletion
/// - Executor: accumulate object keys that the backend has validated for immediate
/// deletion, and execute them in batches of 1000 keys via DeleteObjects.
///
/// Non-deferred deletions, such as during timeline deletion, bypass the first
/// two stages and are passed straight into the Executor.
///
/// Internally, each stage is joined by a channel to the next. In S3, there is only
/// one queue (of DeletionLists), which is written by the frontend and consumed
/// by the backend.
#[derive(Clone)]
pub struct DeletionQueue {
client: DeletionQueueClient,
}
#[derive(Debug)]
struct FlushOp {
tx: tokio::sync::oneshot::Sender<()>,
}
impl FlushOp {
fn fire(self) {
if self.tx.send(()).is_err() {
// oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush.
debug!("deletion queue flush from dropped client");
};
}
}
#[derive(Clone)]
pub struct DeletionQueueClient {
tx: tokio::sync::mpsc::Sender<FrontendQueueMessage>,
executor_tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
}
#[derive(Debug, Serialize, Deserialize)]
struct TenantDeletionList {
/// For each Timeline, a list of key fragments to append to the timeline remote path
/// when reconstructing a full key
timelines: HashMap<TimelineId, Vec<String>>,
/// The generation in which this deletion was emitted: note that this may not be the
/// same as the generation of any layers being deleted. The generation of the layer
/// has already been absorbed into the keys in `objects`
generation: Generation,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
struct DeletionList {
/// Serialization version, for future use
version: u8,
/// Used for constructing a unique key for each deletion list we write out.
sequence: u64,
/// To avoid repeating tenant/timeline IDs in every key, we store keys in
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
/// with one unique generation ID: if someone tries to push a second generation
/// ID for the same tenant, we will start a new DeletionList.
tenants: HashMap<TenantId, TenantDeletionList>,
/// Avoid having to walk `tenants` to calculate size
size: usize,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
struct DeletionHeader {
/// Serialization version, for future use
version: u8,
/// Enable determining the next sequence number even if there are no deletion lists present.
/// If there _are_ deletion lists present, then their sequence numbers take precedence over
/// this.
last_deleted_list_seq: u64,
// TODO: this is where we will track a 'clean' sequence number that indicates all deletion
// lists <= that sequence have had their generations validated with the control plane
// and are OK to execute.
}
impl DeletionHeader {
const VERSION_LATEST: u8 = 1;
fn new(last_deleted_list_seq: u64) -> Self {
Self {
version: Self::VERSION_LATEST,
last_deleted_list_seq,
}
}
}
impl DeletionList {
const VERSION_LATEST: u8 = 1;
fn new(sequence: u64) -> Self {
Self {
version: Self::VERSION_LATEST,
sequence,
tenants: HashMap::new(),
size: 0,
}
}
fn drain(&mut self) -> Self {
let mut tenants = HashMap::new();
std::mem::swap(&mut self.tenants, &mut tenants);
let other = Self {
version: Self::VERSION_LATEST,
sequence: self.sequence,
tenants,
size: self.size,
};
self.size = 0;
other
}
fn is_empty(&self) -> bool {
self.tenants.is_empty()
}
fn len(&self) -> usize {
self.size
}
/// Returns true if the push was accepted, false if the caller must start a new
/// deletion list.
fn push(
&mut self,
tenant: &TenantId,
timeline: &TimelineId,
generation: Generation,
objects: &mut Vec<RemotePath>,
) -> bool {
if objects.is_empty() {
// Avoid inserting an empty TimelineDeletionList: this preserves the property
// that if we have no keys, then self.objects is empty (used in Self::is_empty)
return true;
}
let tenant_entry = self
.tenants
.entry(*tenant)
.or_insert_with(|| TenantDeletionList {
timelines: HashMap::new(),
generation: generation,
});
if tenant_entry.generation != generation {
// Only one generation per tenant per list: signal to
// caller to start a new list.
return false;
}
let timeline_entry = tenant_entry
.timelines
.entry(*timeline)
.or_insert_with(|| Vec::new());
let timeline_remote_path = remote_timeline_path(tenant, timeline);
self.size += objects.len();
timeline_entry.extend(objects.drain(..).map(|p| {
p.strip_prefix(&timeline_remote_path)
.expect("Timeline paths always start with the timeline prefix")
.to_string_lossy()
.to_string()
}));
true
}
fn take_paths(self) -> Vec<RemotePath> {
let mut result = Vec::new();
for (tenant, tenant_deletions) in self.tenants.into_iter() {
for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
result.extend(
timeline_layers
.into_iter()
.map(|l| timeline_remote_path.join(&PathBuf::from(l))),
);
}
}
result
}
}
#[derive(Error, Debug)]
pub enum DeletionQueueError {
#[error("Deletion queue unavailable during shutdown")]
ShuttingDown,
}
impl DeletionQueueClient {
async fn do_push(&self, msg: FrontendQueueMessage) -> Result<(), DeletionQueueError> {
match self.tx.send(msg).await {
Ok(_) => Ok(()),
Err(e) => {
// This shouldn't happen, we should shut down all tenants before
// we shut down the global delete queue. If we encounter a bug like this,
// we may leak objects as deletions won't be processed.
error!("Deletion queue closed while pushing, shutting down? ({e})");
Err(DeletionQueueError::ShuttingDown)
}
}
}
/// Submit a list of layers for deletion: this function will return before the deletion is
/// persistent, but it may be executed at any time after this function enters: do not push
/// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
/// references them).
pub(crate) async fn push_layers(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
generation: Generation,
layers: Vec<(LayerFileName, Generation)>,
) -> Result<(), DeletionQueueError> {
DELETION_QUEUE_SUBMITTED.inc_by(layers.len() as u64);
self.do_push(FrontendQueueMessage::Delete(DeletionOp {
tenant_id,
timeline_id,
layers,
generation,
objects: Vec::new(),
}))
.await
}
async fn do_flush(
&self,
msg: FrontendQueueMessage,
rx: tokio::sync::oneshot::Receiver<()>,
) -> Result<(), DeletionQueueError> {
self.do_push(msg).await?;
if rx.await.is_err() {
// This shouldn't happen if tenants are shut down before deletion queue. If we
// encounter a bug like this, then a flusher will incorrectly believe it has flushed
// when it hasn't, possibly leading to leaking objects.
error!("Deletion queue dropped flush op while client was still waiting");
Err(DeletionQueueError::ShuttingDown)
} else {
Ok(())
}
}
/// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
pub async fn flush(&self) -> Result<(), DeletionQueueError> {
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
self.do_flush(FrontendQueueMessage::Flush(FlushOp { tx }), rx)
.await
}
// Wait until all previous deletions are executed
pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
debug!("flush_execute: flushing to deletion lists...");
// Flush any buffered work to deletion lists
self.flush().await?;
// Flush execution of deletion lists
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
debug!("flush_execute: flushing execution...");
self.do_flush(FrontendQueueMessage::FlushExecute(FlushOp { tx }), rx)
.await?;
debug!("flush_execute: finished flushing execution...");
Ok(())
}
/// This interface bypasses the persistent deletion queue, and any validation
/// that this pageserver is still elegible to execute the deletions. It is for
/// use in timeline deletions, where the control plane is telling us we may
/// delete everything in the timeline.
///
/// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`.
pub(crate) async fn push_immediate(
&self,
objects: Vec<RemotePath>,
) -> Result<(), DeletionQueueError> {
self.executor_tx
.send(ExecutorMessage::Delete(objects))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)
}
/// Companion to push_immediate. When this returns Ok, all prior objects sent
/// into push_immediate have been deleted from remote storage.
pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
self.executor_tx
.send(ExecutorMessage::Flush(FlushOp { tx }))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)?;
rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
}
}
impl DeletionQueue {
pub fn new_client(&self) -> DeletionQueueClient {
self.client.clone()
}
/// Caller may use the returned object to construct clients with new_client.
/// Caller should tokio::spawn the background() members of the two worker objects returned:
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
///
/// If remote_storage is None, then the returned workers will also be None.
pub fn new(
remote_storage: Option<GenericRemoteStorage>,
conf: &'static PageServerConf,
cancel: CancellationToken,
) -> (
Self,
Option<FrontendQueueWorker>,
Option<BackendQueueWorker>,
Option<ExecutorWorker>,
) {
// Deep channel: it consumes deletions from all timelines and we do not want to block them
let (tx, rx) = tokio::sync::mpsc::channel(16384);
// Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
// Shallow channel: it carries lists of paths, and we expect the main queueing to
// happen in the backend (persistent), not in this queue.
let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);
let remote_storage = match remote_storage {
None => {
return (
Self {
client: DeletionQueueClient { tx, executor_tx },
},
None,
None,
None,
)
}
Some(r) => r,
};
(
Self {
client: DeletionQueueClient {
tx,
executor_tx: executor_tx.clone(),
},
},
Some(FrontendQueueWorker::new(
conf,
rx,
backend_tx,
cancel.clone(),
)),
Some(BackendQueueWorker::new(
conf,
backend_rx,
executor_tx,
cancel.clone(),
)),
Some(ExecutorWorker::new(
remote_storage,
executor_rx,
cancel.clone(),
)),
)
}
}
#[cfg(test)]
mod test {
use hex_literal::hex;
use std::{
io::ErrorKind,
path::{Path, PathBuf},
};
use tracing::info;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
use tokio::{runtime::EnterGuard, task::JoinHandle};
use crate::tenant::{harness::TenantHarness, remote_timeline_client::remote_timeline_path};
use super::*;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
struct TestSetup {
runtime: &'static tokio::runtime::Runtime,
_entered_runtime: EnterGuard<'static>,
harness: TenantHarness,
remote_fs_dir: PathBuf,
storage: GenericRemoteStorage,
deletion_queue: DeletionQueue,
fe_worker: JoinHandle<()>,
be_worker: JoinHandle<()>,
ex_worker: JoinHandle<()>,
}
impl TestSetup {
/// Simulate a pageserver restart by destroying and recreating the deletion queue
fn restart(&mut self) {
let (deletion_queue, fe_worker, be_worker, ex_worker) = DeletionQueue::new(
Some(self.storage.clone()),
self.harness.conf,
CancellationToken::new(),
);
self.deletion_queue = deletion_queue;
let mut fe_worker = fe_worker.unwrap();
let mut be_worker = be_worker.unwrap();
let mut ex_worker = ex_worker.unwrap();
let mut fe_worker = self
.runtime
.spawn(async move { fe_worker.background().await });
let mut be_worker = self
.runtime
.spawn(async move { be_worker.background().await });
let mut ex_worker = self.runtime.spawn(async move {
drop(ex_worker.background().await);
});
std::mem::swap(&mut self.fe_worker, &mut fe_worker);
std::mem::swap(&mut self.be_worker, &mut be_worker);
std::mem::swap(&mut self.ex_worker, &mut ex_worker);
// Join the old workers
self.runtime.block_on(fe_worker).unwrap();
self.runtime.block_on(be_worker).unwrap();
self.runtime.block_on(ex_worker).unwrap();
}
}
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
let harness = TenantHarness::create(test_name)?;
// We do not load() the harness: we only need its config and remote_storage
// Set up a GenericRemoteStorage targetting a directory
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
let runtime = Box::leak(Box::new(
tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?,
));
let entered_runtime = runtime.enter();
let (deletion_queue, fe_worker, be_worker, ex_worker) = DeletionQueue::new(
Some(storage.clone()),
harness.conf,
CancellationToken::new(),
);
let mut fe_worker = fe_worker.unwrap();
let mut be_worker = be_worker.unwrap();
let mut ex_worker = ex_worker.unwrap();
let fe_worker_join = runtime.spawn(async move { fe_worker.background().await });
let be_worker_join = runtime.spawn(async move { be_worker.background().await });
let ex_worker_join = runtime.spawn(async move {
drop(ex_worker.background().await);
});
Ok(TestSetup {
runtime,
_entered_runtime: entered_runtime,
harness,
remote_fs_dir,
storage,
deletion_queue,
fe_worker: fe_worker_join,
be_worker: be_worker_join,
ex_worker: ex_worker_join,
})
}
// TODO: put this in a common location so that we can share with remote_timeline_client's tests
fn assert_remote_files(expected: &[&str], remote_path: &Path) {
let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
expected.sort();
let mut found: Vec<String> = Vec::new();
let dir = match std::fs::read_dir(remote_path) {
Ok(d) => d,
Err(e) => {
if e.kind() == ErrorKind::NotFound {
if expected.is_empty() {
// We are asserting prefix is empty: it is expected that the dir is missing
return;
} else {
assert_eq!(expected, Vec::<String>::new());
unreachable!();
}
} else {
panic!(
"Unexpected error listing {0}: {e}",
remote_path.to_string_lossy()
);
}
}
};
for entry in dir.flatten() {
let entry_name = entry.file_name();
let fname = entry_name.to_str().unwrap();
found.push(String::from(fname));
}
found.sort();
assert_eq!(expected, found);
}
fn assert_local_files(expected: &[&str], directory: &Path) {
let mut dir = match std::fs::read_dir(directory) {
Ok(d) => d,
Err(_) => {
assert_eq!(expected, &Vec::<String>::new());
return;
}
};
let mut found = Vec::new();
while let Some(dentry) = dir.next() {
let dentry = dentry.unwrap();
let file_name = dentry.file_name();
let file_name_str = file_name.to_string_lossy();
found.push(file_name_str.to_string());
}
found.sort();
assert_eq!(expected, found);
}
#[test]
fn deletion_queue_smoke() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let tenant_id = ctx.harness.tenant_id;
let content: Vec<u8> = "victim1 contents".into();
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();
// Exercise the distinction between the generation of the layers
// we delete, and the generation of the running Tenant.
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
let remote_layer_file_name_1 =
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
// Inject a victim file to remote storage
info!("Writing");
std::fs::create_dir_all(&remote_timeline_path)?;
std::fs::write(
remote_timeline_path.join(remote_layer_file_name_1.clone()),
content,
)?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
// File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
info!("Pushing");
ctx.runtime.block_on(client.push_layers(
tenant_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
))?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
assert_local_files(&[], &deletion_prefix);
// File should still be there after we write a deletion list (we haven't pushed enough to execute anything)
info!("Flushing");
ctx.runtime.block_on(client.flush())?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
// File should go away when we execute
info!("Flush-executing");
ctx.runtime.block_on(client.flush_execute())?;
assert_remote_files(&[], &remote_timeline_path);
assert_local_files(&["header-01"], &deletion_prefix);
// Flushing on an empty queue should succeed immediately, and not write any lists
info!("Flush-executing on empty");
ctx.runtime.block_on(client.flush_execute())?;
assert_local_files(&["header-01"], &deletion_prefix);
Ok(())
}
#[test]
fn deletion_queue_recovery() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let tenant_id = ctx.harness.tenant_id;
let content: Vec<u8> = "victim1 contents".into();
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
let remote_layer_file_name_1 =
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
// Inject a file, delete it, and flush to a deletion list
std::fs::create_dir_all(&remote_timeline_path)?;
std::fs::write(
remote_timeline_path.join(remote_layer_file_name_1.clone()),
content,
)?;
ctx.runtime.block_on(client.push_layers(
tenant_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
))?;
ctx.runtime.block_on(client.flush())?;
assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
// Restart the deletion queue
drop(client);
ctx.restart();
let client = ctx.deletion_queue.new_client();
// If we have recovered the deletion list properly, then executing after restart should purge it
info!("Flush-executing");
ctx.runtime.block_on(client.flush_execute())?;
assert_remote_files(&[], &remote_timeline_path);
assert_local_files(&["header-01"], &deletion_prefix);
Ok(())
}
}
/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it.
#[cfg(test)]
pub mod mock {
use tracing::info;
use crate::tenant::remote_timeline_client::remote_layer_path;
use super::*;
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
pub struct MockDeletionQueue {
tx: tokio::sync::mpsc::Sender<FrontendQueueMessage>,
executor_tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
tx_pump: tokio::sync::mpsc::Sender<FlushOp>,
executed: Arc<AtomicUsize>,
}
impl MockDeletionQueue {
pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
let (tx, mut rx) = tokio::sync::mpsc::channel(16384);
let (tx_pump, mut rx_pump) = tokio::sync::mpsc::channel::<FlushOp>(1);
let (executor_tx, mut executor_rx) = tokio::sync::mpsc::channel(16384);
let executed = Arc::new(AtomicUsize::new(0));
let executed_bg = executed.clone();
tokio::spawn(async move {
let remote_storage = match &remote_storage {
Some(rs) => rs,
None => {
info!("No remote storage configured, deletion queue will not run");
return;
}
};
info!("Running mock deletion queue");
// Each time we are asked to pump, drain the queue of deletions
while let Some(flush_op) = rx_pump.recv().await {
info!("Executing all pending deletions");
// Transform all executor messages to generic frontend messages
while let Ok(msg) = executor_rx.try_recv() {
match msg {
ExecutorMessage::Delete(objects) => {
for path in objects {
match remote_storage.delete(&path).await {
Ok(_) => {
debug!("Deleted {path}");
}
Err(e) => {
error!(
"Failed to delete {path}, leaking object! ({e})"
);
}
}
executed_bg.fetch_add(1, Ordering::Relaxed);
}
}
ExecutorMessage::Flush(flush_op) => {
flush_op.fire();
}
}
}
while let Ok(msg) = rx.try_recv() {
match msg {
FrontendQueueMessage::Delete(op) => {
let mut objects = op.objects;
for (layer, generation) in op.layers {
objects.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
&layer,
generation,
));
}
for path in objects {
info!("Executing deletion {path}");
match remote_storage.delete(&path).await {
Ok(_) => {
debug!("Deleted {path}");
}
Err(e) => {
error!(
"Failed to delete {path}, leaking object! ({e})"
);
}
}
executed_bg.fetch_add(1, Ordering::Relaxed);
}
}
FrontendQueueMessage::Flush(op) => {
op.fire();
}
FrontendQueueMessage::FlushExecute(op) => {
// We have already executed all prior deletions because mock does them inline
op.fire();
}
}
info!("All pending deletions have been executed");
}
flush_op
.tx
.send(())
.expect("Test called flush but dropped before finishing");
}
});
Self {
tx,
tx_pump,
executor_tx,
executed,
}
}
pub fn get_executed(&self) -> usize {
self.executed.load(Ordering::Relaxed)
}
pub async fn pump(&self) {
let (tx, rx) = tokio::sync::oneshot::channel();
self.tx_pump
.send(FlushOp { tx })
.await
.expect("pump called after deletion queue loop stopped");
rx.await
.expect("Mock delete queue shutdown while waiting to pump");
}
pub(crate) fn new_client(&self) -> DeletionQueueClient {
DeletionQueueClient {
tx: self.tx.clone(),
executor_tx: self.executor_tx.clone(),
}
}
}
}

View File

@@ -0,0 +1,300 @@
use std::collections::HashMap;
use std::time::Duration;
use futures::future::TryFutureExt;
use pageserver_api::control_api::HexTenantId;
use pageserver_api::control_api::{ValidateRequest, ValidateRequestTenant, ValidateResponse};
use serde::de::DeserializeOwned;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::backoff;
use crate::config::PageServerConf;
use crate::metrics::DELETION_QUEUE_ERRORS;
use super::executor::ExecutorMessage;
use super::DeletionHeader;
use super::DeletionList;
use super::DeletionQueueError;
use super::FlushOp;
// After this length of time, execute deletions which are elegible to run,
// even if we haven't accumulated enough for a full-sized DeleteObjects
const EXECUTE_IDLE_DEADLINE: Duration = Duration::from_secs(60);
// If we have received this number of keys, proceed with attempting to execute
const AUTOFLUSH_KEY_COUNT: usize = 16384;
#[derive(Debug)]
pub(super) enum BackendQueueMessage {
Delete(DeletionList),
Flush(FlushOp),
}
pub struct BackendQueueWorker {
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<BackendQueueMessage>,
tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
// Accumulate some lists to execute in a batch.
// The purpose of this accumulation is to implement batched validation of
// attachment generations, when split-brain protection is implemented.
// (see https://github.com/neondatabase/neon/pull/4919)
pending_lists: Vec<DeletionList>,
// Sum of all the lengths of lists in pending_lists
pending_key_count: usize,
// DeletionLists we have fully executed, which may be deleted
// from remote storage.
executed_lists: Vec<DeletionList>,
cancel: CancellationToken,
}
#[derive(thiserror::Error, Debug)]
enum ValidateCallError {
#[error("shutdown")]
Shutdown,
#[error("remote: {0}")]
Remote(reqwest::Error),
}
async fn retry_http_forever<T>(
url: &url::Url,
request: ValidateRequest,
cancel: CancellationToken,
) -> Result<T, DeletionQueueError>
where
T: DeserializeOwned,
{
let client = reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
let response = match backoff::retry(
|| {
client
.post(url.clone())
.json(&request)
.send()
.map_err(|e| ValidateCallError::Remote(e))
},
|_| false,
3,
u32::MAX,
"calling control plane generation validation API",
backoff::Cancel::new(cancel.clone(), || ValidateCallError::Shutdown),
)
.await
{
Err(ValidateCallError::Shutdown) => {
return Err(DeletionQueueError::ShuttingDown);
}
Err(ValidateCallError::Remote(_)) => {
panic!("We retry forever");
}
Ok(r) => r,
};
// TODO: handle non-200 response
// TODO: handle decode error
Ok(response.json::<T>().await.unwrap())
}
impl BackendQueueWorker {
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<BackendQueueMessage>,
tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
cancel: CancellationToken,
) -> Self {
Self {
conf,
rx,
tx,
pending_lists: Vec::new(),
pending_key_count: 0,
executed_lists: Vec::new(),
cancel,
}
}
async fn cleanup_lists(&mut self) {
debug!(
"cleanup_lists: {0} executed lists, {1} pending lists",
self.executed_lists.len(),
self.pending_lists.len()
);
// Lists are always pushed into the queues + executed list in sequence order, so
// no sort is required: can find the highest sequence number by peeking at last element
let max_executed_seq = match self.executed_lists.last() {
Some(v) => v.sequence,
None => {
// No executed lists, nothing to clean up.
return;
}
};
// In case this is the last list, write a header out first so that
// we don't risk losing our knowledge of the sequence number (on replay, our
// next sequence number is the highest list seen + 1, or read from the header
// if there are no lists)
let header = DeletionHeader::new(max_executed_seq);
debug!("Writing header {:?}", header);
let header_bytes =
serde_json::to_vec(&header).expect("Failed to serialize deletion header");
let header_path = self.conf.deletion_header_path();
if let Err(e) = tokio::fs::write(&header_path, header_bytes).await {
warn!("Failed to upload deletion queue header: {e:#}");
DELETION_QUEUE_ERRORS
.with_label_values(&["put_header"])
.inc();
return;
}
while let Some(list) = self.executed_lists.pop() {
let list_path = self.conf.deletion_list_path(list.sequence);
if let Err(e) = tokio::fs::remove_file(&list_path).await {
// Unexpected: we should have permissions and nothing else should
// be touching these files
tracing::error!("Failed to delete {0}: {e:#}", list_path.display());
self.executed_lists.push(list);
break;
}
}
}
pub async fn validate_lists(&mut self) -> Result<(), DeletionQueueError> {
let control_plane_api = match &self.conf.control_plane_api {
None => {
// Generations are not switched on yet.
return Ok(());
}
Some(api) => api,
};
let validate_path = control_plane_api
.join("validate")
.expect("Failed to build validate path");
for list in &mut self.pending_lists {
let request = ValidateRequest {
tenants: list
.tenants
.iter()
.map(|(tid, tdl)| ValidateRequestTenant {
id: HexTenantId::new(*tid),
gen: tdl.generation.into().expect(
"Generation should always be valid for a Tenant doing deletions",
),
})
.collect(),
};
// Retry forever, we cannot make progress until we get a response
let response: ValidateResponse =
retry_http_forever(&validate_path, request, self.cancel.clone()).await?;
let tenants_valid: HashMap<_, _> = response
.tenants
.into_iter()
.map(|t| (t.id.take(), t.valid))
.collect();
// Filter the list based on whether the server responded valid: true.
// If a tenant is omitted in the response, it has been deleted, and we should
// proceed with deletion.
list.tenants.retain(|tenant_id, _tenant| {
let r = tenants_valid.get(tenant_id).map(|v| *v).unwrap_or(true);
if !r {
warn!("Dropping stale deletions for tenant {tenant_id}, objects may be leaked");
}
r
});
}
Ok(())
}
pub async fn flush(&mut self) {
// Issue any required generation validation calls to the control plane
if let Err(DeletionQueueError::ShuttingDown) = self.validate_lists().await {
warn!("Shutting down");
return;
}
// Submit all keys from pending DeletionLists into the executor
for list in self.pending_lists.drain(..) {
let objects = list.take_paths();
if let Err(_e) = self.tx.send(ExecutorMessage::Delete(objects)).await {
warn!("Shutting down");
return;
};
}
// Flush the executor to ensure all the operations we just submitted have been executed
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
let flush_op = FlushOp { tx };
if let Err(_e) = self.tx.send(ExecutorMessage::Flush(flush_op)).await {
warn!("Shutting down");
return;
};
if rx.await.is_err() {
warn!("Shutting down");
return;
}
// After flush, we are assured that all contents of the pending lists
// are executed
self.pending_key_count = 0;
self.executed_lists.append(&mut self.pending_lists);
// Erase the lists we executed
self.cleanup_lists().await;
}
pub async fn background(&mut self) {
// TODO: if we would like to be able to defer deletions while a Layer still has
// refs (but it will be elegible for deletion after process ends), then we may
// add an ephemeral part to BackendQueueMessage::Delete that tracks which keys
// in the deletion list may not be deleted yet, with guards to block on while
// we wait to proceed.
loop {
let msg = match tokio::time::timeout(EXECUTE_IDLE_DEADLINE, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
break;
}
Err(_) => {
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
// return immediately if no work is pending
self.flush().await;
continue;
}
};
match msg {
BackendQueueMessage::Delete(list) => {
self.pending_key_count += list.len();
self.pending_lists.push(list);
if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
self.flush().await;
}
}
BackendQueueMessage::Flush(op) => {
self.flush().await;
op.fire();
}
}
}
}
}

View File

@@ -0,0 +1,143 @@
use remote_storage::GenericRemoteStorage;
use remote_storage::RemotePath;
use remote_storage::MAX_KEYS_PER_DELETE;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::info;
use tracing::warn;
use crate::metrics::DELETION_QUEUE_ERRORS;
use crate::metrics::DELETION_QUEUE_EXECUTED;
use super::DeletionQueueError;
use super::FlushOp;
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
pub(super) enum ExecutorMessage {
Delete(Vec<RemotePath>),
Flush(FlushOp),
}
/// Non-persistent deletion queue, for coalescing multiple object deletes into
/// larger DeleteObjects requests.
pub struct ExecutorWorker {
// Accumulate up to 1000 keys for the next deletion operation
accumulator: Vec<RemotePath>,
rx: tokio::sync::mpsc::Receiver<ExecutorMessage>,
cancel: CancellationToken,
remote_storage: GenericRemoteStorage,
}
impl ExecutorWorker {
pub(super) fn new(
remote_storage: GenericRemoteStorage,
rx: tokio::sync::mpsc::Receiver<ExecutorMessage>,
cancel: CancellationToken,
) -> Self {
Self {
remote_storage,
rx,
cancel,
accumulator: Vec::new(),
}
}
/// Wrap the remote `delete_objects` with a failpoint
pub async fn remote_delete(&self) -> Result<(), anyhow::Error> {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
DELETION_QUEUE_ERRORS
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint hit"))
});
self.remote_storage.delete_objects(&self.accumulator).await
}
/// Block until everything in accumulator has been executed
pub async fn flush(&mut self) -> Result<(), DeletionQueueError> {
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
match self.remote_delete().await {
Ok(()) => {
// Note: we assume that the remote storage layer returns Ok(()) if some
// or all of the deleted objects were already gone.
DELETION_QUEUE_EXECUTED.inc_by(self.accumulator.len() as u64);
info!(
"Executed deletion batch {}..{}",
self.accumulator
.first()
.expect("accumulator should be non-empty"),
self.accumulator
.last()
.expect("accumulator should be non-empty"),
);
self.accumulator.clear();
}
Err(e) => {
warn!("DeleteObjects request failed: {e:#}, will retry");
DELETION_QUEUE_ERRORS.with_label_values(&["execute"]).inc();
}
};
}
if self.cancel.is_cancelled() {
// Expose an error because we may not have actually flushed everything
Err(DeletionQueueError::ShuttingDown)
} else {
Ok(())
}
}
pub async fn background(&mut self) -> Result<(), DeletionQueueError> {
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
loop {
if self.cancel.is_cancelled() {
return Err(DeletionQueueError::ShuttingDown);
}
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
return Err(DeletionQueueError::ShuttingDown);
}
Err(_) => {
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
// return immediately if no work is pending
self.flush().await?;
continue;
}
};
match msg {
ExecutorMessage::Delete(mut list) => {
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
self.flush().await?;
// If we have received this number of keys, proceed with attempting to execute
assert_eq!(self.accumulator.len(), 0);
}
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
let take_count = std::cmp::min(available_slots, list.len());
for path in list.drain(list.len() - take_count..) {
self.accumulator.push(path);
}
}
}
ExecutorMessage::Flush(flush_op) => {
// If flush() errors, we drop the flush_op and the caller will get
// an error recv()'ing their oneshot channel.
self.flush().await?;
flush_op.fire();
}
}
}
}
}

View File

@@ -0,0 +1,376 @@
use super::BackendQueueMessage;
use super::DeletionHeader;
use super::DeletionList;
use super::FlushOp;
use std::fs::create_dir_all;
use std::time::Duration;
use regex::Regex;
use remote_storage::RemotePath;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::generation::Generation;
use utils::id::TenantId;
use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::metrics::DELETION_QUEUE_ERRORS;
use crate::metrics::DELETION_QUEUE_SUBMITTED;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
// of magnitude 1MB when we are under heavy delete load.
const DELETION_LIST_TARGET_SIZE: usize = 16384;
// Ordinarily, we only flush to DeletionList periodically, to bound the window during
// which we might leak objects from not flushing a DeletionList after
// the objects are already unlinked from timeline metadata.
const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
// more objects before doing the flush.
const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
#[derive(Debug)]
pub(super) struct DeletionOp {
pub(super) tenant_id: TenantId,
pub(super) timeline_id: TimelineId,
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerFileName, Generation)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant attachment in which we are enqueuing
/// this deletion.
pub(super) generation: Generation,
}
#[derive(Debug)]
pub(super) enum FrontendQueueMessage {
Delete(DeletionOp),
// Wait until all prior deletions make it into a persistent DeletionList
Flush(FlushOp),
// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
FlushExecute(FlushOp),
}
pub struct FrontendQueueWorker {
conf: &'static PageServerConf,
// Incoming frontend requests to delete some keys
rx: tokio::sync::mpsc::Receiver<FrontendQueueMessage>,
// Outbound requests to the backend to execute deletion lists we have composed.
tx: tokio::sync::mpsc::Sender<BackendQueueMessage>,
// The list we are currently building, contains a buffer of keys to delete
// and our next sequence number
pending: DeletionList,
// These FlushOps should fire the next time we flush
pending_flushes: Vec<FlushOp>,
// Worker loop is torn down when this fires.
cancel: CancellationToken,
}
impl FrontendQueueWorker {
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<FrontendQueueMessage>,
tx: tokio::sync::mpsc::Sender<BackendQueueMessage>,
cancel: CancellationToken,
) -> Self {
Self {
pending: DeletionList::new(1),
conf,
rx,
tx,
pending_flushes: Vec::new(),
cancel,
}
}
async fn upload_pending_list(&mut self) -> anyhow::Result<()> {
let path = self.conf.deletion_list_path(self.pending.sequence);
let bytes = serde_json::to_vec(&self.pending).expect("Failed to serialize deletion list");
tokio::fs::write(&path, &bytes).await?;
tokio::fs::File::open(&path).await?.sync_all().await?;
Ok(())
}
/// Try to flush `list` to persistent storage
///
/// This does not return errors, because on failure to flush we do not lose
/// any state: flushing will be retried implicitly on the next deadline
async fn flush(&mut self) {
if self.pending.is_empty() {
for f in self.pending_flushes.drain(..) {
f.fire();
}
return;
}
match self.upload_pending_list().await {
Ok(_) => {
info!(sequence = self.pending.sequence, "Stored deletion list");
for f in self.pending_flushes.drain(..) {
f.fire();
}
let onward_list = self.pending.drain();
// We have consumed out of pending: reset it for the next incoming deletions to accumulate there
self.pending = DeletionList::new(self.pending.sequence + 1);
if let Err(e) = self.tx.send(BackendQueueMessage::Delete(onward_list)).await {
// This is allowed to fail: it will only happen if the backend worker is shut down,
// so we can just drop this on the floor.
info!("Deletion list dropped, this is normal during shutdown ({e:#})");
}
}
Err(e) => {
DELETION_QUEUE_ERRORS.with_label_values(&["put_list"]).inc();
warn!(
sequence = self.pending.sequence,
"Failed to write deletion list to remote storage, will retry later ({e:#})"
);
}
}
}
async fn recover(&mut self) -> Result<(), anyhow::Error> {
// Load header: this is not required to be present, e.g. when a pageserver first runs
let header_path = self.conf.deletion_header_path();
// Synchronous, but we only do it once per process lifetime so it's tolerable
create_dir_all(&self.conf.deletion_prefix())?;
let header_bytes = match tokio::fs::read(&header_path).await {
Ok(h) => Ok(Some(h)),
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
debug!(
"Deletion header {0} not found, first start?",
header_path.display()
);
Ok(None)
} else {
Err(e)
}
}
}?;
if let Some(header_bytes) = header_bytes {
if let Some(header) = match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
Ok(h) => Some(h),
Err(e) => {
warn!(
"Failed to deserialize deletion header, ignoring {0}: {e:#}",
header_path.display()
);
// This should never happen unless we make a mistake with our serialization.
// Ignoring a deletion header is not consequential for correctnes because all deletions
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
None
}
} {
self.pending.sequence =
std::cmp::max(self.pending.sequence, header.last_deleted_list_seq + 1);
};
};
let mut dir = match tokio::fs::read_dir(&self.conf.deletion_prefix()).await {
Ok(d) => d,
Err(e) => {
warn!(
"Failed to open deletion list directory {0}: {e:#}",
header_path.display()
);
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let list_name_pattern = Regex::new("([a-zA-Z0-9]{16})-([a-zA-Z0-9]{2}).list").unwrap();
let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await? {
let file_name = dentry.file_name().to_owned();
let basename = file_name.to_string_lossy();
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
m.get(1)
.expect("Non optional group should be present")
.as_str()
} else {
warn!("Unexpected key in deletion queue: {basename}");
continue;
};
let seq: u64 = match u64::from_str_radix(seq_part, 16) {
Ok(s) => s,
Err(e) => {
warn!("Malformed key '{basename}': {e}");
continue;
}
};
seqs.push(seq);
}
seqs.sort();
// Initialize the next sequence number in the frontend based on the maximum of the highest list we see,
// and the last list that was deleted according to the header. Combined with writing out the header
// prior to deletions, this guarnatees no re-use of sequence numbers.
if let Some(max_list_seq) = seqs.last() {
self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
}
for s in seqs {
let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path).await?;
let deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l,
Err(e) => {
// Drop the list on the floor: any objects it referenced will be left behind
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
continue;
}
};
// We will drop out of recovery if this fails: it indicates that we are shutting down
// or the backend has panicked
DELETION_QUEUE_SUBMITTED.inc_by(deletion_list.len() as u64);
self.tx
.send(BackendQueueMessage::Delete(deletion_list))
.await?;
}
info!(next_sequence = self.pending.sequence, "Replay complete");
Ok(())
}
/// This is the front-end ingest, where we bundle up deletion requests into DeletionList
/// and write them out, for later
pub async fn background(&mut self) {
info!("Started deletion frontend worker");
let mut recovered: bool = false;
while !self.cancel.is_cancelled() {
let timeout = if self.pending_flushes.is_empty() {
FRONTEND_DEFAULT_TIMEOUT
} else {
FRONTEND_FLUSHING_TIMEOUT
};
let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
Ok(Some(msg)) => msg,
Ok(None) => {
// Queue sender destroyed, shutting down
break;
}
Err(_) => {
// Hit deadline, flush.
self.flush().await;
continue;
}
};
// On first message, do recovery. This avoids unnecessary recovery very
// early in startup, and simplifies testing by avoiding a 404 reading the
// header on every first pageserver startup.
if !recovered {
// Before accepting any input from this pageserver lifetime, recover all deletion lists that are in S3
if let Err(e) = self.recover().await {
// This should only happen in truly unrecoverable cases, like the recovery finding that the backend
// queue receiver has been dropped.
info!("Deletion queue recover aborted, deletion queue will not proceed ({e})");
return;
} else {
recovered = true;
}
}
match msg {
FrontendQueueMessage::Delete(op) => {
debug!(
"Delete: ingesting {0} layers, {1} other objects",
op.layers.len(),
op.objects.len()
);
let mut layer_paths = Vec::new();
for (layer, generation) in op.layers {
layer_paths.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
&layer,
generation,
));
}
layer_paths.extend(op.objects);
if self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
) == false
{
self.flush().await;
let retry = self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
);
if retry != true {
// Unexpeted: after we flush, we should have
// drained self.pending, so a conflict on
// generation numbers should be impossible.
tracing::error!(
"Failed to enqueue deletions, leaking objects. This is a bug."
);
}
}
}
FrontendQueueMessage::Flush(op) => {
if self.pending.is_empty() {
// Execute immediately
debug!("Flush: No pending objects, flushing immediately");
op.fire()
} else {
// Execute next time we flush
debug!("Flush: adding to pending flush list for next deadline flush");
self.pending_flushes.push(op);
}
}
FrontendQueueMessage::FlushExecute(op) => {
debug!("FlushExecute: passing through to backend");
// We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
if let Err(e) = self.tx.send(BackendQueueMessage::Flush(op)).await {
info!("Can't flush, shutting down ({e})");
// Caller will get error when their oneshot sender was dropped.
}
}
}
if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
self.flush().await;
}
}
info!("Deletion queue shut down.");
}
}

View File

@@ -52,6 +52,29 @@ paths:
schema:
type: object
/v1/deletion_queue/flush:
parameters:
- name: execute
in: query
required: false
schema:
type: boolean
description:
If true, attempt to execute deletions. If false, just flush deletions to persistent deletion lists.
put:
description: Execute any deletions currently enqueued
security: []
responses:
"200":
description: |
Flush completed: if execute was true, then enqueued deletions have been completed. If execute was false,
then enqueued deletions have been persisted to deletion lists, and may have been completed.
content:
application/json:
schema:
type: object
/v1/tenant/{tenant_id}:
parameters:
- name: tenant_id

View File

@@ -8,10 +8,9 @@ use anyhow::{anyhow, Context, Result};
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
};
use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
use remote_storage::GenericRemoteStorage;
use storage_broker::BrokerClientChannel;
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -24,6 +23,7 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::{DeletionQueue, DeletionQueueError};
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
@@ -54,20 +54,22 @@ use utils::{
// Imports only used for testing APIs
use super::models::ConfigureFailpointsRequest;
pub struct State {
struct State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: DeletionQueue,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
}
impl State {
pub fn new(
fn new(
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: DeletionQueue,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
) -> anyhow::Result<Self> {
@@ -81,6 +83,7 @@ impl State {
allowlist_routes,
remote_storage,
broker_client,
deletion_queue,
disk_usage_eviction_state,
})
}
@@ -285,8 +288,6 @@ async fn build_timeline_info_common(
let state = timeline.current_state();
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
let walreceiver_status = timeline.walreceiver_status();
let info = TimelineInfo {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
@@ -307,8 +308,6 @@ async fn build_timeline_info_common(
pg_version: timeline.pg_version,
state,
walreceiver_status,
};
Ok(info)
}
@@ -490,7 +489,20 @@ async fn tenant_attach_handler(
let state = get_state(&request);
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
let generation = if state.conf.control_plane_api.is_some() {
// If we have been configured with a control plane URI, then generations are
// mandatory, as we will attempt to re-attach on startup.
maybe_body
.as_ref()
.map(|tar| tar.generation)
.flatten()
.map(|g| Generation::new(g))
.ok_or(ApiError::BadRequest(anyhow!(
"generation attribute missing"
)))?
} else {
Generation::none()
};
if let Some(remote_storage) = &state.remote_storage {
mgr::attach_tenant(
@@ -500,6 +512,7 @@ async fn tenant_attach_handler(
tenant_conf,
state.broker_client.clone(),
remote_storage.clone(),
&state.deletion_queue,
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
@@ -548,7 +561,7 @@ async fn tenant_detach_handler(
}
async fn tenant_load_handler(
mut request: Request<Body>,
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
@@ -556,20 +569,13 @@ async fn tenant_load_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
let state = get_state(&request);
// The /load request is only usable when control_plane_api is not set. Once it is set, callers
// should always use /attach instead.
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
mgr::load_tenant(
state.conf,
tenant_id,
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
&state.deletion_queue,
&ctx,
)
.instrument(info_span!("load", %tenant_id))
@@ -869,21 +875,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
Ok(response)
}
/// Helper for requests that may take a generation, which is mandatory
/// when control_plane_api is set, but otherwise defaults to Generation::none()
fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
if state.conf.control_plane_api.is_some() {
req_gen
.map(Generation::new)
.ok_or(ApiError::BadRequest(anyhow!(
"generation attribute missing"
)))
} else {
// Legacy mode: all tenants operate with no generation
Ok(Generation::none())
}
}
async fn tenant_create_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
@@ -900,12 +891,16 @@ async fn tenant_create_handler(
let tenant_conf =
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
let state = get_state(&request);
let generation = get_request_generation(state, request_data.generation)?;
// TODO: make generation mandatory here once control plane supports it.
let generation = request_data
.generation
.map(|g| Generation::new(g))
.unwrap_or(Generation::none());
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let state = get_state(&request);
let new_tenant = mgr::create_tenant(
state.conf,
tenant_conf,
@@ -913,6 +908,7 @@ async fn tenant_create_handler(
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
&state.deletion_queue,
&ctx,
)
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1153,6 +1149,48 @@ async fn always_panic_handler(
json_response(StatusCode::NO_CONTENT, ())
}
async fn deletion_queue_flush(
r: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&r);
if state.remote_storage.is_none() {
// Nothing to do if remote storage is disabled.
return json_response(StatusCode::OK, ());
}
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
let queue_client = state.deletion_queue.new_client();
tokio::select! {
flush_result = async {
if execute {
queue_client.flush_execute().await
} else {
queue_client.flush().await
}
} => {
match flush_result {
Ok(())=> {
json_response(StatusCode::OK, ())
},
Err(e) => {
match e {
DeletionQueueError::ShuttingDown => {
Err(ApiError::ShuttingDown)
}
}
}
}
},
_ = cancel.cancelled() => {
Err(ApiError::ShuttingDown)
}
}
}
async fn disk_usage_eviction_run(
mut r: Request<Body>,
_cancel: CancellationToken,
@@ -1357,9 +1395,13 @@ where
}
pub fn make_router(
state: Arc<State>,
conf: &'static PageServerConf,
launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<JwtAuth>>,
broker_client: BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: DeletionQueue,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1383,7 +1425,17 @@ pub fn make_router(
);
Ok(router
.data(state)
.data(Arc::new(
State::new(
conf,
auth,
remote_storage,
deletion_queue,
broker_client,
disk_usage_eviction_state,
)
.context("Failed to initialize router state")?,
))
.get("/v1/status", |r| api_handler(r, status_handler))
.put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler)
@@ -1463,6 +1515,9 @@ pub fn make_router(
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})

View File

@@ -3,7 +3,7 @@ pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod context;
mod control_plane_client;
pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;

View File

@@ -537,7 +537,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
30.000, // 30000 ms
];
/// VirtualFile fs operation variants.
/// Tracks time taken by fs operations near VirtualFile.
///
/// Operations:
/// - open ([`std::fs::OpenOptions::open`])
@@ -548,66 +548,15 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
/// - seek (modify internal position or file length query)
/// - fsync ([`std::fs::File::sync_all`])
/// - metadata ([`std::fs::File::metadata`])
#[derive(
Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
)]
pub(crate) enum StorageIoOperation {
Open,
Close,
CloseByReplace,
Read,
Write,
Seek,
Fsync,
Metadata,
}
impl StorageIoOperation {
pub fn as_str(&self) -> &'static str {
match self {
StorageIoOperation::Open => "open",
StorageIoOperation::Close => "close",
StorageIoOperation::CloseByReplace => "close-by-replace",
StorageIoOperation::Read => "read",
StorageIoOperation::Write => "write",
StorageIoOperation::Seek => "seek",
StorageIoOperation::Fsync => "fsync",
StorageIoOperation::Metadata => "metadata",
}
}
}
/// Tracks time taken by fs operations near VirtualFile.
#[derive(Debug)]
pub(crate) struct StorageIoTime {
metrics: [Histogram; StorageIoOperation::COUNT],
}
impl StorageIoTime {
fn new() -> Self {
let storage_io_histogram_vec = register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric");
let metrics = std::array::from_fn(|i| {
let op = StorageIoOperation::from_repr(i).unwrap();
let metric = storage_io_histogram_vec
.get_metric_with_label_values(&[op.as_str()])
.unwrap();
metric
});
Self { metrics }
}
pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
&self.metrics[op as usize]
}
}
pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric")
});
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
@@ -846,6 +795,31 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
.expect("failed to define a metric")
});
pub(crate) static DELETION_QUEUE_SUBMITTED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_deletion_queue_submitted_total",
"Number of objects submitted for deletion"
)
.expect("failed to define a metric")
});
pub(crate) static DELETION_QUEUE_EXECUTED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_deletion_queue_executed_total",
"Number of objects deleted"
)
.expect("failed to define a metric")
});
pub(crate) static DELETION_QUEUE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_deletion_queue_errors_total",
"Incremented on retryable remote I/O errors writing deletion lists or executing deletions.",
&["op_kind"],
)
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_timeline_client_bytes_started",
@@ -1216,12 +1190,6 @@ impl TimelineMetrics {
),
}
}
pub fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz);
self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz);
}
}
impl Drop for TimelineMetrics {

View File

@@ -799,9 +799,8 @@ impl PageCache {
fn new(num_pages: usize) -> Self {
assert!(num_pages > 0, "page cache size must be > 0");
// We could use Vec::leak here, but that potentially also leaks
// uninitialized reserved capacity. With into_boxed_slice and Box::leak
// this is avoided.
// We use Box::leak here and into_boxed_slice to avoid leaking uninitialized
// memory that Vec's might contain.
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;

View File

@@ -469,9 +469,7 @@ impl PageServerHandler {
// Create empty timeline
info!("creating new timeline");
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?;
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute

View File

@@ -32,7 +32,9 @@ use std::fmt::Debug;
use std::fmt::Display;
use std::fs;
use std::fs::File;
use std::fs::OpenOptions;
use std::io;
use std::io::Write;
use std::ops::Bound::Included;
use std::path::Path;
use std::path::PathBuf;
@@ -57,6 +59,7 @@ use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
@@ -66,7 +69,7 @@ use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
pub use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
@@ -113,11 +116,12 @@ pub mod block_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_map;
pub mod manifest;
mod span;
pub mod metadata;
mod par_fsync;
mod remote_timeline_client;
pub mod remote_timeline_client;
pub mod storage_layer;
pub mod config;
@@ -141,9 +145,6 @@ pub use crate::tenant::metadata::save_metadata;
// re-export for use in walreceiver
pub use crate::tenant::timeline::WalReceiverInfo;
/// The "tenants" part of `tenants/<tenant>/timelines...`
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
/// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
@@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: Option<GenericRemoteStorage>,
pub deletion_queue_client: DeletionQueueClient,
}
///
@@ -180,8 +182,7 @@ pub struct Tenant {
tenant_id: TenantId,
/// The remote storage generation, used to protect S3 objects from split-brain.
/// Does not change over the lifetime of the [`Tenant`] object.
// The remote storage generation, used to protect S3 objects from split-brain
generation: Generation,
timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
@@ -195,7 +196,10 @@ pub struct Tenant {
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: Option<GenericRemoteStorage>,
remote_storage: Option<GenericRemoteStorage>,
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: Option<DeletionQueueClient>,
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -407,6 +411,7 @@ impl Tenant {
remote_startup_data: Option<RemoteStartupData>,
local_metadata: Option<TimelineMetadata>,
ancestor: Option<Arc<Timeline>>,
first_save: bool,
init_order: Option<&InitializationOrder>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
@@ -440,9 +445,14 @@ impl Tenant {
// Save the metadata file to local disk.
if !picked_local {
save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
.await
.context("save_metadata")?;
save_metadata(
self.conf,
&tenant_id,
&timeline_id,
up_to_date_metadata,
first_save,
)
.context("save_metadata")?;
}
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
@@ -526,6 +536,7 @@ impl Tenant {
broker_client: storage_broker::BrokerClientChannel,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
// TODO dedup with spawn_load
@@ -541,6 +552,7 @@ impl Tenant {
tenant_id,
generation,
Some(remote_storage.clone()),
Some(deletion_queue_client),
));
// Do all the hard work in the background
@@ -726,6 +738,7 @@ impl Tenant {
remote_metadata,
TimelineResources {
remote_client: Some(remote_client),
deletion_queue_client: self.deletion_queue_client.clone(),
},
ctx,
)
@@ -750,6 +763,7 @@ impl Tenant {
timeline_id,
&index_part.metadata,
Some(remote_timeline_client),
self.deletion_queue_client.clone(),
None,
)
.await
@@ -827,6 +841,7 @@ impl Tenant {
}),
local_metadata,
ancestor,
true,
None,
ctx,
)
@@ -851,6 +866,7 @@ impl Tenant {
tenant_id,
Generation::broken(),
None,
None,
))
}
@@ -885,6 +901,7 @@ impl Tenant {
let broker_client = resources.broker_client;
let remote_storage = resources.remote_storage;
let deletion_queue_client = resources.deletion_queue_client;
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Tenant::new(
@@ -895,6 +912,7 @@ impl Tenant {
tenant_id,
generation,
remote_storage.clone(),
Some(deletion_queue_client),
);
let tenant = Arc::new(tenant);
@@ -1302,6 +1320,7 @@ impl Tenant {
timeline_id,
&local_metadata,
Some(remote_client),
self.deletion_queue_client.clone(),
init_order,
)
.await
@@ -1351,6 +1370,7 @@ impl Tenant {
timeline_id,
&local_metadata,
None,
None,
init_order,
)
.await
@@ -1379,6 +1399,7 @@ impl Tenant {
remote_startup_data,
Some(local_metadata),
ancestor,
false,
init_order,
ctx,
)
@@ -1442,7 +1463,7 @@ impl Tenant {
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
/// minimum amount of keys required to get a writable timeline.
/// (Without it, `put` might fail due to `repartition` failing.)
pub async fn create_empty_timeline(
pub fn create_empty_timeline(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
@@ -1454,10 +1475,10 @@ impl Tenant {
"Cannot create empty timelines on inactive tenant"
);
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
};
let timelines = self.timelines.lock().unwrap();
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?;
drop(timelines);
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent LSN to 0, The caller must import some data to
// make it valid, before calling finish_creation()
@@ -1476,7 +1497,6 @@ impl Tenant {
initdb_lsn,
None,
)
.await
}
/// Helper for unit tests to create an empty timeline.
@@ -1492,9 +1512,7 @@ impl Tenant {
pg_version: u32,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let uninit_tl = self
.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?;
let tline = uninit_tl.raw_timeline().expect("we just created it");
assert_eq!(tline.get_last_record_lsn(), Lsn(0));
@@ -1512,15 +1530,6 @@ impl Tenant {
tline.maybe_spawn_flush_loop();
tline.freeze_and_flush().await.context("freeze_and_flush")?;
// Make sure the freeze_and_flush reaches remote storage.
tline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
let tl = uninit_tl.finish_creation()?;
// The non-test code would call tl.activate() here.
tl.set_state(TimelineState::Active);
@@ -1697,6 +1706,65 @@ impl Tenant {
Ok(())
}
/// Flush all in-memory data to disk and remote storage, if any.
///
/// Used at graceful shutdown.
async fn freeze_and_flush_on_shutdown(&self) {
let mut js = tokio::task::JoinSet::new();
// execute on each timeline on the JoinSet, join after.
let per_timeline = |timeline_id: TimelineId, timeline: Arc<Timeline>| {
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
match timeline.freeze_and_flush().await {
Ok(()) => {}
Err(e) => {
warn!("failed to freeze and flush: {e:#}");
return;
}
}
let res = if let Some(client) = timeline.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.wait_completion().await
} else {
Ok(())
};
if let Err(e) = res {
warn!("failed to await for frozen and flushed uploads: {e:#}");
}
}
.instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id))
};
{
let timelines = self.timelines.lock().unwrap();
timelines
.iter()
.map(|(id, tl)| (*id, Arc::clone(tl)))
.for_each(|(timeline_id, timeline)| {
js.spawn(per_timeline(timeline_id, timeline));
})
};
while let Some(res) = js.join_next().await {
match res {
Ok(()) => {}
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
Err(je) if je.is_panic() => { /* logged already */ }
Err(je) => warn!("unexpected JoinError: {je:?}"),
}
}
}
pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
@@ -1827,22 +1895,19 @@ impl Tenant {
}
};
let mut js = tokio::task::JoinSet::new();
{
let timelines = self.timelines.lock().unwrap();
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let span = Span::current();
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
})
};
while let Some(res) = js.join_next().await {
match res {
Ok(()) => {}
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
Err(je) if je.is_panic() => { /* logged already */ }
Err(je) => warn!("unexpected JoinError: {je:?}"),
}
if freeze_and_flush {
// walreceiver has already began to shutdown with TenantState::Stopping, but we need to
// await for them to stop.
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_id),
None,
)
.await;
// this will wait for uploads to complete; in the past, it was done outside tenant
// shutdown in pageserver::shutdown_pageserver.
self.freeze_and_flush_on_shutdown().await;
}
// shutdown all tenant and timeline tasks: gc, compaction, page service
@@ -2250,7 +2315,16 @@ impl Tenant {
tenant_id: TenantId,
generation: Generation,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: Option<DeletionQueueClient>,
) -> Tenant {
#[cfg(not(test))]
match state {
TenantState::Broken { .. } => {}
_ => {
// Non-broken tenants must be constructed with a deletion queue
assert!(deletion_queue_client.is_some());
}
}
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
@@ -2317,6 +2391,7 @@ impl Tenant {
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
deletion_queue_client,
state,
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2369,37 +2444,72 @@ impl Tenant {
Ok(tenant_conf)
}
#[tracing::instrument(skip_all, fields(%tenant_id))]
pub(super) async fn persist_tenant_config(
pub(super) fn persist_tenant_config(
tenant_id: &TenantId,
target_config_path: &Path,
tenant_conf: TenantConfOpt,
creating_tenant: bool,
) -> anyhow::Result<()> {
// imitate a try-block with a closure
info!("persisting tenantconf to {}", target_config_path.display());
let _enter = info_span!("saving tenantconf").entered();
let mut conf_content = r#"# This file contains a specific per-tenant's config.
// imitate a try-block with a closure
let do_persist = |target_config_path: &Path| -> anyhow::Result<()> {
let target_config_parent = target_config_path.parent().with_context(|| {
format!(
"Config path does not have a parent: {}",
target_config_path.display()
)
})?;
info!("persisting tenantconf to {}", target_config_path.display());
let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.
[tenant_config]
"#
.to_string();
.to_string();
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
let conf_content = conf_content.as_bytes();
let mut target_config_file = VirtualFile::open_with_options(
target_config_path,
OpenOptions::new()
.truncate(true) // This needed for overwriting with small config files
.write(true)
.create_new(creating_tenant)
// when creating a new tenant, first_save will be true and `.create(true)` will be
// ignored (per rust std docs).
//
// later when updating the config of created tenant, or persisting config for the
// first time for attached tenant, the `.create(true)` is used.
.create(true),
)?;
let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
.await
.with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})?;
Ok(())
target_config_file
.write(conf_content.as_bytes())
.context("write toml bytes into file")
.and_then(|_| target_config_file.sync_all().context("fsync config file"))
.context("write config file")?;
// fsync the parent directory to ensure the directory entry is durable.
// before this was done conditionally on creating_tenant, but these management actions are rare
// enough to just fsync it always.
crashsafe::fsync(target_config_parent)?;
// XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant`
Ok(())
};
// this function is called from creating the tenant and updating the tenant config, which
// would otherwise share this context, so keep it here in one place.
do_persist(target_config_path).with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})
}
//
@@ -2710,15 +2820,13 @@ impl Tenant {
src_timeline.pg_version,
);
let uninitialized_timeline = self
.prepare_new_timeline(
dst_id,
&metadata,
timeline_uninit_mark,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
)
.await?;
let uninitialized_timeline = self.prepare_new_timeline(
dst_id,
&metadata,
timeline_uninit_mark,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
)?;
let new_timeline = uninitialized_timeline.finish_creation()?;
@@ -2796,15 +2904,13 @@ impl Tenant {
pgdata_lsn,
pg_version,
);
let raw_timeline = self
.prepare_new_timeline(
timeline_id,
&new_metadata,
timeline_uninit_mark,
pgdata_lsn,
None,
)
.await?;
let raw_timeline = self.prepare_new_timeline(
timeline_id,
&new_metadata,
timeline_uninit_mark,
pgdata_lsn,
None,
)?;
let tenant_id = raw_timeline.owning_tenant.tenant_id;
let unfinished_timeline = raw_timeline.raw_timeline()?;
@@ -2866,7 +2972,10 @@ impl Tenant {
None
};
TimelineResources { remote_client }
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
}
}
/// Creates intermediate timeline structure and its files.
@@ -2875,7 +2984,7 @@ impl Tenant {
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
/// uninit mark file.
async fn prepare_new_timeline(
fn prepare_new_timeline(
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
@@ -2903,9 +3012,8 @@ impl Tenant {
timeline_struct.init_empty_layer_map(start_lsn);
if let Err(e) = self
.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
.await
if let Err(e) =
self.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
{
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(uninit_mark);
@@ -2921,7 +3029,7 @@ impl Tenant {
))
}
async fn create_timeline_files(
fn create_timeline_files(
&self,
timeline_path: &Path,
new_timeline_id: &TimelineId,
@@ -2933,9 +3041,14 @@ impl Tenant {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});
save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata)
.await
.context("Failed to create timeline metadata")?;
save_metadata(
self.conf,
&self.tenant_id,
new_timeline_id,
new_metadata,
true,
)
.context("Failed to create timeline metadata")?;
Ok(())
}
@@ -3082,7 +3195,7 @@ pub(crate) enum CreateTenantFilesMode {
Attach,
}
pub(crate) async fn create_tenant_files(
pub(crate) fn create_tenant_files(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: &TenantId,
@@ -3118,8 +3231,7 @@ pub(crate) async fn create_tenant_files(
mode,
&temporary_tenant_dir,
&target_tenant_directory,
)
.await;
);
if creation_result.is_err() {
error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
@@ -3137,7 +3249,7 @@ pub(crate) async fn create_tenant_files(
Ok(target_tenant_directory)
}
async fn try_create_target_tenant_dir(
fn try_create_target_tenant_dir(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: &TenantId,
@@ -3176,7 +3288,7 @@ async fn try_create_target_tenant_dir(
)
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?;
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
@@ -3381,8 +3493,6 @@ pub mod harness {
pub tenant_conf: TenantConf,
pub tenant_id: TenantId,
pub generation: Generation,
pub remote_storage: GenericRemoteStorage,
pub remote_fs_dir: PathBuf,
}
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3420,39 +3530,30 @@ pub mod harness {
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
let remote_fs_dir = conf.workdir.join("localfs");
std::fs::create_dir_all(&remote_fs_dir).unwrap();
let config = RemoteStorageConfig {
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
Ok(Self {
conf,
tenant_conf,
tenant_id,
generation: Generation::new(0xdeadbeef),
remote_storage,
remote_fs_dir,
})
}
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
(
self.try_load(&ctx)
self.try_load(&ctx, None, None)
.await
.expect("failed to load test tenant"),
ctx,
)
}
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
pub async fn try_load(
&self,
ctx: &RequestContext,
remote_storage: Option<remote_storage::GenericRemoteStorage>,
deletion_queue_client: Option<DeletionQueueClient>,
) -> anyhow::Result<Arc<Tenant>> {
let walredo_mgr = Arc::new(TestRedoManager);
let tenant = Arc::new(Tenant::new(
@@ -3462,7 +3563,8 @@ pub mod harness {
walredo_mgr,
self.tenant_id,
self.generation,
Some(self.remote_storage.clone()),
remote_storage,
deletion_queue_client,
));
tenant
.load(None, ctx)
@@ -3575,10 +3677,7 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
match tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
{
match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) {
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(
e.to_string(),
@@ -3933,13 +4032,6 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
}
let (tenant, _ctx) = harness.load().await;
@@ -3973,14 +4065,6 @@ mod tests {
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
}
// check that both of them are initially unloaded
@@ -4033,13 +4117,6 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
drop(tline);
// so that all uploads finish & we can call harness.try_load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
drop(tenant);
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
@@ -4051,7 +4128,11 @@ mod tests {
metadata_bytes[8] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;
let err = harness.try_load(&ctx).await.err().expect("should fail");
let err = harness
.try_load(&ctx, None, None)
.await
.err()
.expect("should fail");
// get all the stack with all .context, not only the last one
let message = format!("{err:#}");
let expected = "failed to load metadata";
@@ -4436,9 +4517,8 @@ mod tests {
.await;
let initdb_lsn = Lsn(0x20);
let utline = tenant
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
let utline =
tenant.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)?;
let tline = utline.raw_timeline().unwrap();
// Spawn flush loop now so that we can set the `expect_initdb_optimization`
@@ -4503,15 +4583,9 @@ mod tests {
let harness = TenantHarness::create(name)?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline =
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
// Keeps uninit mark in place
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown(false)
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
.await;
std::mem::forget(tline);
}

View File

@@ -13,7 +13,6 @@
//!
use crate::page_cache::PAGE_SZ;
use crate::tenant::block_io::BlockCursor;
use crate::virtual_file::VirtualFile;
use std::cmp::min;
use std::io::{Error, ErrorKind};
@@ -84,24 +83,35 @@ impl<'a> BlockCursor<'a> {
}
}
/// A wrapper of `VirtualFile` that allows users to write blobs.
///
/// If a `BlobWriter` is dropped, the internal buffer will be
/// discarded. You need to call [`flush_buffer`](Self::flush_buffer)
/// manually before dropping.
pub struct BlobWriter<const BUFFERED: bool> {
inner: VirtualFile,
offset: u64,
/// A buffer to save on write calls, only used if BUFFERED=true
buf: Vec<u8>,
/// Abstract trait for a data sink that you can write blobs to.
///
pub trait BlobWriter {
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error>;
}
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
Self {
///
/// An implementation of BlobWriter to write blobs to anything that
/// implements std::io::Write.
///
pub struct WriteBlobWriter<W>
where
W: std::io::Write,
{
inner: W,
offset: u64,
}
impl<W> WriteBlobWriter<W>
where
W: std::io::Write,
{
pub fn new(inner: W, start_offset: u64) -> Self {
WriteBlobWriter {
inner,
offset: start_offset,
buf: Vec::with_capacity(Self::CAPACITY),
}
}
@@ -109,79 +119,28 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
self.offset
}
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
#[inline(always)]
/// Writes the given buffer directly to the underlying `VirtualFile`.
/// You need to make sure that the internal buffer is empty, otherwise
/// data will be written in wrong order.
async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
self.inner.write_all(src_buf).await?;
self.offset += src_buf.len() as u64;
Ok(())
/// Access the underlying Write object.
///
/// NOTE: WriteBlobWriter keeps track of the current write offset. If
/// you write something directly to the inner Write object, it makes the
/// internally tracked 'offset' to go out of sync. So don't do that.
pub fn into_inner(self) -> W {
self.inner
}
}
#[inline(always)]
/// Flushes the internal buffer to the underlying `VirtualFile`.
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
self.inner.write_all(&self.buf).await?;
self.buf.clear();
Ok(())
}
#[inline(always)]
/// Writes as much of `src_buf` into the internal buffer as it fits
fn write_into_buffer(&mut self, src_buf: &[u8]) -> usize {
let remaining = Self::CAPACITY - self.buf.len();
let to_copy = src_buf.len().min(remaining);
self.buf.extend_from_slice(&src_buf[..to_copy]);
self.offset += to_copy as u64;
to_copy
}
/// Internal, possibly buffered, write function
async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
if !BUFFERED {
assert!(self.buf.is_empty());
self.write_all_unbuffered(src_buf).await?;
return Ok(());
}
let remaining = Self::CAPACITY - self.buf.len();
// First try to copy as much as we can into the buffer
if remaining > 0 {
let copied = self.write_into_buffer(src_buf);
src_buf = &src_buf[copied..];
}
// Then, if the buffer is full, flush it out
if self.buf.len() == Self::CAPACITY {
self.flush_buffer().await?;
}
// Finally, write the tail of src_buf:
// If it wholly fits into the buffer without
// completely filling it, then put it there.
// If not, write it out directly.
if !src_buf.is_empty() {
assert_eq!(self.buf.len(), 0);
if src_buf.len() < Self::CAPACITY {
let copied = self.write_into_buffer(src_buf);
// We just verified above that src_buf fits into our internal buffer.
assert_eq!(copied, src_buf.len());
} else {
self.write_all_unbuffered(src_buf).await?;
}
}
Ok(())
}
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
impl<W> BlobWriter for WriteBlobWriter<W>
where
W: std::io::Write,
{
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
let offset = self.offset;
if srcbuf.len() < 128 {
// Short blob. Write a 1-byte length header
let len_buf = srcbuf.len() as u8;
self.write_all(&[len_buf]).await?;
self.inner.write_all(&[len_buf])?;
self.offset += 1;
} else {
// Write a 4-byte length header
if srcbuf.len() > 0x7fff_ffff {
@@ -192,153 +151,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
}
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
len_buf[0] |= 0x80;
self.write_all(&len_buf).await?;
self.inner.write_all(&len_buf)?;
self.offset += 4;
}
self.write_all(srcbuf).await?;
self.inner.write_all(srcbuf)?;
self.offset += srcbuf.len() as u64;
Ok(offset)
}
}
impl BlobWriter<true> {
/// Access the underlying `VirtualFile`.
///
/// This function flushes the internal buffer before giving access
/// to the underlying `VirtualFile`.
pub async fn into_inner(mut self) -> Result<VirtualFile, Error> {
self.flush_buffer().await?;
Ok(self.inner)
}
/// Access the underlying `VirtualFile`.
///
/// Unlike [`into_inner`](Self::into_inner), this doesn't flush
/// the internal buffer before giving access.
pub fn into_inner_no_flush(self) -> VirtualFile {
self.inner
}
}
impl BlobWriter<false> {
/// Access the underlying `VirtualFile`.
pub fn into_inner(self) -> VirtualFile {
self.inner
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tenant::block_io::BlockReaderRef;
use rand::{Rng, SeedableRng};
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
let temp_dir = tempfile::tempdir()?;
let path = temp_dir.path().join("file");
// Write part (in block to drop the file)
let mut offsets = Vec::new();
{
let file = VirtualFile::create(&path).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let offs = wtr.write_blob(blob).await?;
offsets.push(offs);
}
// Write out one page worth of zeros so that we can
// read again with read_blk
let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
println!("Writing final blob at offs={offs}");
wtr.flush_buffer().await?;
}
let file = VirtualFile::open(&path).await?;
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new(rdr);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
let blob_read = rdr.read_blob(*offset).await?;
assert_eq!(
blob, &blob_read,
"mismatch for idx={idx} at offset={offset}"
);
}
Ok(())
}
fn random_array(len: usize) -> Vec<u8> {
let mut rng = rand::thread_rng();
(0..len).map(|_| rng.gen()).collect::<_>()
}
#[tokio::test]
async fn test_one() -> Result<(), Error> {
let blobs = &[vec![12, 21, 22]];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_hello_simple() -> Result<(), Error> {
let blobs = &[
vec![0, 1, 2, 3],
b"Hello, World!".to_vec(),
Vec::new(),
b"foobar".to_vec(),
];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_really_big_array() -> Result<(), Error> {
let blobs = &[
b"test".to_vec(),
random_array(10 * PAGE_SZ),
b"foobar".to_vec(),
];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_arrays_inc() -> Result<(), Error> {
let blobs = (0..PAGE_SZ / 8)
.map(|v| random_array(v * 16))
.collect::<Vec<_>>();
round_trip_test::<false>(&blobs).await?;
round_trip_test::<true>(&blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_arrays_random_size() -> Result<(), Error> {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
let blobs = (0..1024)
.map(|_| {
let mut sz: u16 = rng.gen();
// Make 50% of the arrays small
if rng.gen() {
sz |= 63;
}
random_array(sz.into())
})
.collect::<Vec<_>>();
round_trip_test::<false>(&blobs).await?;
round_trip_test::<true>(&blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_arrays_page_boundary() -> Result<(), Error> {
let blobs = &[
random_array(PAGE_SZ - 4),
random_array(PAGE_SZ - 4),
random_array(PAGE_SZ - 4),
];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
Ok(())
}
}

View File

@@ -7,7 +7,9 @@ use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::fs::File;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
/// blocks, using the page cache
@@ -71,13 +73,12 @@ impl<'a> Deref for BlockLease<'a> {
///
/// Unlike traits, we also support the read function to be async though.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReader(&'a FileBlockReader),
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
#[cfg(test)]
TestDisk(&'a super::disk_btree::tests::TestDisk),
#[cfg(test)]
VirtualFile(&'a VirtualFile),
}
impl<'a> BlockReaderRef<'a> {
@@ -85,13 +86,12 @@ impl<'a> BlockReaderRef<'a> {
async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
use BlockReaderRef::*;
match self {
FileBlockReader(r) => r.read_blk(blknum).await,
FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
FileBlockReaderFile(r) => r.read_blk(blknum).await,
EphemeralFile(r) => r.read_blk(blknum).await,
Adapter(r) => r.read_blk(blknum).await,
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
VirtualFile(r) => r.read_blk(blknum).await,
}
}
}
@@ -105,7 +105,7 @@ impl<'a> BlockReaderRef<'a> {
///
/// ```no_run
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
/// # let reader: FileBlockReader = unimplemented!("stub");
/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
/// let cursor = reader.block_cursor();
/// let buf = cursor.read_blk(1);
/// // do stuff with 'buf'
@@ -122,9 +122,9 @@ impl<'a> BlockCursor<'a> {
BlockCursor { reader }
}
// Needed by cli
pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
BlockCursor {
reader: BlockReaderRef::FileBlockReader(reader),
reader: BlockReaderRef::FileBlockReaderVirtual(reader),
}
}
@@ -143,26 +143,27 @@ impl<'a> BlockCursor<'a> {
///
/// The file is assumed to be immutable. This doesn't provide any functions
/// for modifying the file, nor for invalidating the cache if it is modified.
pub struct FileBlockReader {
pub file: VirtualFile,
pub struct FileBlockReader<F> {
pub file: F,
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
}
impl FileBlockReader {
pub fn new(file: VirtualFile) -> Self {
impl<F> FileBlockReader<F>
where
F: FileExt,
{
pub fn new(file: F) -> Self {
let file_id = page_cache::next_file_id();
FileBlockReader { file_id, file }
}
/// Read a page from the underlying file into given buffer.
async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file
.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
.await
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
}
/// Read a block.
///
@@ -184,7 +185,7 @@ impl FileBlockReader {
ReadBufResult::Found(guard) => break Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
self.fill_buffer(write_guard.deref_mut(), blknum)?;
write_guard.mark_valid();
// Swap for read lock
@@ -195,9 +196,15 @@ impl FileBlockReader {
}
}
impl BlockReader for FileBlockReader {
impl BlockReader for FileBlockReader<File> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReader(self))
BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
}
}
impl BlockReader for FileBlockReader<VirtualFile> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
}
}

View File

@@ -9,6 +9,7 @@ use std::cmp::min;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::os::unix::prelude::FileExt;
use std::path::PathBuf;
use std::sync::atomic::AtomicU64;
use tracing::*;
@@ -28,7 +29,7 @@ pub struct EphemeralFile {
}
impl EphemeralFile {
pub async fn create(
pub fn create(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -44,8 +45,7 @@ impl EphemeralFile {
let file = VirtualFile::open_with_options(
&filename,
OpenOptions::new().read(true).write(true).create(true),
)
.await?;
)?;
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
@@ -88,8 +88,7 @@ impl EphemeralFile {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
write_guard.mark_valid();
// Swap for read lock
@@ -129,15 +128,10 @@ impl EphemeralFile {
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
match self
.ephemeral_file
.file
.write_all_at(
&self.ephemeral_file.mutable_tail,
self.blknum as u64 * PAGE_SZ as u64,
)
.await
{
match self.ephemeral_file.file.write_all_at(
&self.ephemeral_file.mutable_tail,
self.blknum as u64 * PAGE_SZ as u64,
) {
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
@@ -287,7 +281,7 @@ mod tests {
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
let pos_foo = file.write_blob(b"foo").await?;
assert_eq!(

View File

@@ -0,0 +1,325 @@
//! This module contains the encoding and decoding of the local manifest file.
//!
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
//! records the state of the storage engine. It contains a snapshot of the
//! state and all operations proceeding that snapshot. The file begins with a
//! header recording MANIFEST version number. After that, it contains a snapshot.
//! The snapshot is followed by a list of operations. Each operation is a list
//! of records. Each record is either an addition or a removal of a layer.
//!
//! With MANIFEST, we can:
//!
//! 1. recover state quickly by reading the file, potentially boosting the
//! startup speed.
//! 2. ensure all operations are atomic and avoid corruption, solving issues
//! like redundant image layer and preparing us for future compaction
//! strategies.
//!
//! There is also a format for storing all layer files on S3, called
//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which
//! records all operations as logs, and therefore we can easily replay the
//! operations when recovering from crash, while ensuring those operations
//! are atomic upon restart.
//!
//! Currently, this is not used in the system. Future refactors will ensure
//! the storage state will be recorded in this file, and the system can be
//! recovered from this file. This is tracked in
//! <https://github.com/neondatabase/neon/issues/4418>
use std::io::{self, Read, Write};
use crate::virtual_file::VirtualFile;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::crc32c;
use serde::{Deserialize, Serialize};
use tracing::log::warn;
use utils::lsn::Lsn;
use super::storage_layer::PersistentLayerDesc;
pub struct Manifest {
file: VirtualFile,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct Snapshot {
pub layers: Vec<PersistentLayerDesc>,
}
/// serde by default encode this in tagged enum, and therefore it will be something
/// like `{ "AddLayer": { ... } }`.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Record {
AddLayer(PersistentLayerDesc),
RemoveLayer(PersistentLayerDesc),
}
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
const MANIFEST_VERSION: u64 = 1;
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct ManifestHeader {
magic_number: u64,
version: u64,
}
const MANIFEST_HEADER_LEN: usize = 16;
impl ManifestHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
buf.put_u64(self.magic_number);
buf.put_u64(self.version);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
Self {
magic_number: buf.get_u64(),
version: buf.get_u64(),
}
}
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Operation {
/// A snapshot of the current state.
///
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
Snapshot(Snapshot, Lsn),
/// An atomic operation that changes the state.
///
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
/// This will only change when new L0 is flushed to the disk.
Operation(Vec<Record>, Lsn),
}
struct RecordHeader {
size: u32,
checksum: u32,
}
const RECORD_HEADER_LEN: usize = 8;
impl RecordHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
buf.put_u32(self.size);
buf.put_u32(self.checksum);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
Self {
size: buf.get_u32(),
checksum: buf.get_u32(),
}
}
}
#[derive(Debug, thiserror::Error)]
pub enum ManifestLoadError {
#[error("manifest header is corrupted")]
CorruptedManifestHeader,
#[error("unsupported manifest version: got {0}, expected {1}")]
UnsupportedVersion(u64, u64),
#[error("error when decoding record: {0}")]
DecodeRecord(serde_json::Error),
#[error("I/O error: {0}")]
Io(io::Error),
}
#[must_use = "Should check if the manifest is partially corrupted"]
pub struct ManifestPartiallyCorrupted(bool);
impl Manifest {
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
let mut manifest = Self { file };
manifest.append_manifest_header(ManifestHeader {
magic_number: MANIFEST_MAGIC_NUMBER,
version: MANIFEST_VERSION,
})?;
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
Ok(manifest)
}
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
/// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
/// backup the current one.
pub fn load(
mut file: VirtualFile,
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
let mut buf = vec![];
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
// Read manifest header
let mut buf = Bytes::from(buf);
if buf.remaining() < MANIFEST_HEADER_LEN {
return Err(ManifestLoadError::CorruptedManifestHeader);
}
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
buf.advance(MANIFEST_HEADER_LEN);
if header.version != MANIFEST_VERSION {
return Err(ManifestLoadError::UnsupportedVersion(
header.version,
MANIFEST_VERSION,
));
}
// Read operations
let mut operations = Vec::new();
let corrupted = loop {
if buf.remaining() == 0 {
break false;
}
if buf.remaining() < RECORD_HEADER_LEN {
warn!("incomplete header when decoding manifest, could be corrupted");
break true;
}
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
let size = size as usize;
buf.advance(RECORD_HEADER_LEN);
if buf.remaining() < size {
warn!("incomplete data when decoding manifest, could be corrupted");
break true;
}
let data = &buf[..size];
if crc32c(data) != checksum {
warn!("checksum mismatch when decoding manifest, could be corrupted");
break true;
}
// if the following decode fails, we cannot use the manifest or safely ignore any record.
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
buf.advance(size);
};
Ok((
Self { file },
operations,
ManifestPartiallyCorrupted(corrupted),
))
}
fn append_data(&mut self, data: &[u8]) -> Result<()> {
if data.len() >= u32::MAX as usize {
panic!("data too large");
}
let header = RecordHeader {
size: data.len() as u32,
checksum: crc32c(data),
};
let header = header.encode();
self.file.write_all(&header)?;
self.file.write_all(data)?;
self.file.sync_all()?;
Ok(())
}
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
let encoded = header.encode();
self.file.write_all(&encoded)?;
Ok(())
}
/// Add an operation to the manifest. The operation will be appended to the end of the file,
/// and the file will fsync.
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
let encoded = Vec::from(serde_json::to_string(&operation)?);
self.append_data(&encoded)
}
}
#[cfg(test)]
mod tests {
use std::fs::OpenOptions;
use crate::repository::Key;
use super::*;
#[test]
fn test_read_manifest() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
std::fs::create_dir_all(&testdir).unwrap();
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
// Write a manifest with a snapshot and some operations
let snapshot = Snapshot {
layers: vec![layer1, layer2],
};
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
manifest
.append_operation(Operation::Operation(
vec![Record::AddLayer(layer3.clone())],
Lsn::from(1),
))
.unwrap();
drop(manifest);
// Open the second time and write
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 2);
assert_eq!(
&operations[0],
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
);
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
manifest
.append_operation(Operation::Operation(
vec![
Record::RemoveLayer(layer3.clone()),
Record::AddLayer(layer4.clone()),
],
Lsn::from(2),
))
.unwrap();
drop(manifest);
// Open the third time and verify
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 3);
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
assert_eq!(
&operations[2],
&Operation::Operation(
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
Lsn::from(2)
)
);
}
}

View File

@@ -8,13 +8,14 @@
//!
//! [`remote_timeline_client`]: super::remote_timeline_client
use std::io::{self};
use std::fs::{File, OpenOptions};
use std::io::{self, Write};
use anyhow::{ensure, Context};
use anyhow::{bail, ensure, Context};
use serde::{de::Error, Deserialize, Serialize, Serializer};
use thiserror::Error;
use tracing::info_span;
use utils::bin_ser::SerializeError;
use utils::crashsafe::path_with_suffix_extension;
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
@@ -23,7 +24,6 @@ use utils::{
use crate::config::PageServerConf;
use crate::virtual_file::VirtualFile;
use crate::TEMP_FILE_SUFFIX;
/// Use special format number to enable backward compatibility.
const METADATA_FORMAT_VERSION: u16 = 4;
@@ -230,23 +230,6 @@ impl TimelineMetadata {
pub fn pg_version(&self) -> u32 {
self.body.pg_version
}
// Checksums make it awkward to build a valid instance by hand. This helper
// provides a TimelineMetadata with a valid checksum in its header.
#[cfg(test)]
pub fn example() -> Self {
let instance = Self::new(
"0/16960E8".parse::<Lsn>().unwrap(),
None,
None,
Lsn::from_hex("00000000").unwrap(),
Lsn::from_hex("00000000").unwrap(),
Lsn::from_hex("00000000").unwrap(),
0,
);
let bytes = instance.to_bytes().unwrap();
Self::from_bytes(&bytes).unwrap()
}
}
impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -272,19 +255,38 @@ impl Serialize for TimelineMetadata {
}
/// Save timeline metadata to file
#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
pub async fn save_metadata(
pub fn save_metadata(
conf: &'static PageServerConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
data: &TimelineMetadata,
first_save: bool,
) -> anyhow::Result<()> {
let _enter = info_span!("saving metadata").entered();
let path = conf.metadata_path(tenant_id, timeline_id);
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
.await
.context("write metadata")?;
// use OpenOptions to ensure file presence is consistent with first_save
let mut file = VirtualFile::open_with_options(
&path,
OpenOptions::new().write(true).create_new(first_save),
)
.context("open_with_options")?;
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
if file.write(&metadata_bytes)? != metadata_bytes.len() {
bail!("Could not write all the metadata bytes in a single call");
}
file.sync_all()?;
// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(
path.parent()
.expect("Metadata should always have a parent dir"),
)?;
timeline_dir.sync_all()?;
}
Ok(())
}

View File

@@ -1,18 +1,19 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use rand::{distributions::Alphanumeric, Rng};
use hyper::StatusCode;
use pageserver_api::control_api::{HexTenantId, ReAttachRequest, ReAttachResponse};
use std::collections::{hash_map, HashMap};
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::path::Path;
use std::sync::Arc;
use std::time::Duration;
use tokio::fs;
use anyhow::Context;
use once_cell::sync::Lazy;
use tokio::sync::RwLock;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use remote_storage::GenericRemoteStorage;
@@ -20,14 +21,13 @@ use utils::crashsafe;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::control_plane_client::ControlPlaneClient;
use crate::deletion_queue::DeletionQueue;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;
use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
@@ -64,39 +64,6 @@ impl TenantsMap {
}
}
/// This is "safe" in that that it won't leave behind a partially deleted directory
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
/// the contents.
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<()> {
let tmp_path = safe_rename_tenant_dir(path).await?;
fs::remove_dir_all(tmp_path).await
}
async fn safe_rename_tenant_dir(path: impl AsRef<Path>) -> std::io::Result<PathBuf> {
let parent = path
.as_ref()
.parent()
// It is invalid to call this function with a relative path. Tenant directories
// should always have a parent.
.ok_or(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"Path must be absolute",
))?;
let rand_suffix = rand::thread_rng()
.sample_iter(&Alphanumeric)
.take(8)
.map(char::from)
.collect::<String>()
+ TEMP_FILE_SUFFIX;
let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
fs::rename(&path, &tmp_path).await?;
fs::File::open(parent).await?.sync_all().await?;
Ok(tmp_path)
}
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
/// Initialize repositories with locally available timelines.
@@ -107,19 +74,82 @@ pub async fn init_tenant_mgr(
conf: &'static PageServerConf,
resources: TenantSharedResources,
init_order: InitializationOrder,
cancel: CancellationToken,
) -> anyhow::Result<()> {
// Scan local filesystem for attached tenants
let tenants_dir = conf.tenants_path();
let mut tenants = HashMap::new();
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
Some(client.re_attach().await?)
} else {
info!("Control plane API not configured, tenant generations are disabled");
None
// If we are configured to use the control plane API, then it is the source of truth for what to attach
let tenant_generations = conf
.control_plane_api
.as_ref()
.map(|control_plane_api| async {
let client = reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
// FIXME: it's awkward that join() requires the base to have a trailing slash, makes
// it easy to get a config wrong
assert!(
control_plane_api.as_str().ends_with("/"),
"control plane API needs trailing slash"
);
let re_attach_path = control_plane_api
.join("re-attach")
.expect("Failed to build re-attach path");
let request = ReAttachRequest { node_id: conf.id };
// TODO: we should have been passed a cancellation token, and use it to end
// this loop gracefully
loop {
let response = match client
.post(re_attach_path.clone())
.json(&request)
.send()
.await
{
Err(e) => Err(anyhow::Error::from(e)),
Ok(r) => {
if r.status() == StatusCode::OK {
r.json::<ReAttachResponse>()
.await
.map_err(|e| anyhow::Error::from(e))
} else {
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
}
}
};
match response {
Ok(res) => {
tracing::info!(
"Received re-attach response with {0} tenants",
res.tenants.len()
);
// TODO: do something with it
break res
.tenants
.into_iter()
.map(|t| (t.id, t.generation))
.collect::<HashMap<_, _>>();
}
Err(e) => {
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
}
});
let tenant_generations = match tenant_generations {
Some(g) => Some(g.await),
None => {
info!("Control plane API not configured, tenant generations are disabled");
None
}
};
let mut dir_entries = fs::read_dir(&tenants_dir)
@@ -138,8 +168,6 @@ pub async fn init_tenant_mgr(
"Found temporary tenant directory, removing: {}",
tenant_dir_path.display()
);
// No need to use safe_remove_tenant_dir_all because this is already
// a temporary path
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove temporary directory '{}': {:?}",
@@ -180,7 +208,7 @@ pub async fn init_tenant_mgr(
Ok(id) => id,
Err(_) => {
warn!(
"Invalid tenant path (garbage in our repo directory?): {}",
"Invalid tenant path (garbage in our repo directory?): {0}",
tenant_dir_path.display()
);
continue;
@@ -190,11 +218,11 @@ pub async fn init_tenant_mgr(
let generation = if let Some(generations) = &tenant_generations {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_id) {
*gen
if let Some(gen) = generations.get(&HexTenantId::new(tenant_id)) {
Generation::new(*gen)
} else {
info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id);
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove detached tenant directory '{}': {:?}",
tenant_dir_path.display(),
@@ -207,7 +235,7 @@ pub async fn init_tenant_mgr(
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(
"Starting tenant {} in legacy mode, no generation",
"Starting tenant {0} in legacy mode, no generation",
tenant_dir_path.display()
);
Generation::none()
@@ -251,7 +279,6 @@ pub async fn init_tenant_mgr(
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn schedule_local_tenant_processing(
conf: &'static PageServerConf,
tenant_id: TenantId,
@@ -293,6 +320,7 @@ pub(crate) fn schedule_local_tenant_processing(
resources.broker_client,
tenants,
remote_storage,
resources.deletion_queue_client,
ctx,
) {
Ok(tenant) => tenant,
@@ -440,19 +468,21 @@ pub async fn create_tenant(
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: &DeletionQueue,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
tenant_map_insert(tenant_id, || {
// We're holding the tenants lock in write mode while doing local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
// and do the work in that state.
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let tenant_resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client: deletion_queue.new_client(),
};
let created_tenant =
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
@@ -486,8 +516,7 @@ pub async fn set_new_tenant_config(
let tenant = get_tenant(tenant_id, true).await?;
let tenant_config_path = conf.tenant_config_path(&tenant_id);
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf)
.await
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
@@ -503,8 +532,6 @@ pub enum GetTenantError {
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
///
/// This method is cancel-safe.
pub async fn get_tenant(
tenant_id: TenantId,
active_only: bool,
@@ -564,24 +591,7 @@ pub async fn detach_tenant(
tenant_id: TenantId,
detach_ignored: bool,
) -> Result<(), TenantStateError> {
let tmp_path = detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await?;
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
Ok(())
detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
}
async fn detach_tenant0(
@@ -589,16 +599,20 @@ async fn detach_tenant0(
tenants: &tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
detach_ignored: bool,
) -> Result<PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
) -> Result<(), TenantStateError> {
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
safe_rename_tenant_dir(&local_tenant_directory)
fs::remove_dir_all(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} removal")
})?;
Ok(())
};
let removal_result =
remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await;
remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
.await;
// Ignored tenants are not present in memory and will bail the removal from memory operation.
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
@@ -606,10 +620,10 @@ async fn detach_tenant0(
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
if tenant_ignore_mark.exists() {
info!("Detaching an ignored tenant");
let tmp_path = tenant_dir_rename_operation(tenant_id)
local_files_cleanup_operation(tenant_id)
.await
.with_context(|| format!("Ignored tenant {tenant_id} local directory rename"))?;
return Ok(tmp_path);
.with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?;
return Ok(());
}
}
@@ -619,12 +633,12 @@ async fn detach_tenant0(
pub async fn load_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: &DeletionQueue,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
tenant_map_insert(tenant_id, || {
let tenant_path = conf.tenant_path(&tenant_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
if tenant_ignore_mark.exists() {
@@ -635,8 +649,11 @@ pub async fn load_tenant(
let resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client: deletion_queue.new_client(),
};
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
// TODO: remove the `/load` API once generation support is complete:
// it becomes equivalent to attaching.
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, Generation::none(), resources, None, &TENANTS, ctx)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
@@ -704,10 +721,11 @@ pub async fn attach_tenant(
tenant_conf: TenantConfOpt,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
deletion_queue: &DeletionQueue,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
tenant_map_insert(tenant_id, || {
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
@@ -721,6 +739,7 @@ pub async fn attach_tenant(
let resources = TenantSharedResources {
broker_client,
remote_storage: Some(remote_storage),
deletion_queue_client: deletion_queue.new_client(),
};
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
@@ -755,13 +774,12 @@ pub enum TenantMapInsertError {
///
/// NB: the closure should return quickly because the current implementation of tenants map
/// serializes access through an `RwLock`.
async fn tenant_map_insert<F, R>(
async fn tenant_map_insert<F>(
tenant_id: TenantId,
insert_fn: F,
) -> Result<Arc<Tenant>, TenantMapInsertError>
where
F: FnOnce() -> R,
R: std::future::Future<Output = anyhow::Result<Arc<Tenant>>>,
F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
{
let mut guard = TENANTS.write().await;
let m = match &mut *guard {
@@ -774,7 +792,7 @@ where
tenant_id,
e.get().current_state(),
)),
hash_map::Entry::Vacant(v) => match insert_fn().await {
hash_map::Entry::Vacant(v) => match insert_fn() {
Ok(tenant) => {
v.insert(tenant.clone());
Ok(tenant)

View File

@@ -4,9 +4,10 @@ use std::{
sync::atomic::{AtomicUsize, Ordering},
};
use crate::virtual_file::VirtualFile;
fn fsync_path(path: &Path) -> io::Result<()> {
// TODO use VirtualFile::fsync_all once we fully go async.
let file = std::fs::File::open(path)?;
let file = VirtualFile::open(path)?;
file.sync_all()
}

View File

@@ -56,9 +56,11 @@
//! # Consistency
//!
//! To have a consistent remote structure, it's important that uploads and
//! deletions are performed in the right order. For example, the index file
//! contains a list of layer files, so it must not be uploaded until all the
//! layer files that are in its list have been successfully uploaded.
//! deletions are performed in the right order. For example:
//! - the index file contains a list of layer files, so it must not be uploaded
//! until all the layer files that are in its list have been successfully uploaded.
//! - objects must be removed from the index before being deleted, and that updated
//! index must be written to remote storage before deleting the objects from remote storage.
//!
//! The contract between client and its user is that the user is responsible of
//! scheduling operations in an order that keeps the remote consistent as
@@ -70,10 +72,12 @@
//! correct order, and the client will parallelize the operations in a way that
//! is safe.
//!
//! The caller should be careful with deletion, though. They should not delete
//! local files that have been scheduled for upload but not yet finished uploading.
//! Otherwise the upload will fail. To wait for an upload to finish, use
//! the 'wait_completion' function (more on that later.)
//! The caller should be careful with deletion, though:
//! - they should not delete local files that have been scheduled for upload but
//! not yet finished uploading. Otherwise the upload will fail. To wait for an
//! upload to finish, use the 'wait_completion' function (more on that later.)
//! - they should not to remote deletions via DeletionQueue without waiting for
//! the latest metadata to upload via RemoteTimelineClient.
//!
//! All of this relies on the following invariants:
//!
@@ -200,12 +204,11 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
mod delete;
mod download;
pub mod index;
mod upload;
use anyhow::Context;
use anyhow::{bail, Context};
use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
@@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -234,8 +238,6 @@ use crate::metrics::{
use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete;
use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{
config::PageServerConf,
task_mgr,
@@ -245,6 +247,7 @@ use crate::{
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
tenant::TIMELINES_SEGMENT_NAME,
};
use utils::id::{TenantId, TimelineId};
@@ -342,12 +345,7 @@ impl RemoteTimelineClient {
) -> RemoteTimelineClient {
RemoteTimelineClient {
conf,
runtime: if cfg!(test) {
// remote_timeline_client.rs tests rely on current-thread runtime
tokio::runtime::Handle::current()
} else {
BACKGROUND_RUNTIME.handle().clone()
},
runtime: BACKGROUND_RUNTIME.handle().to_owned(),
tenant_id,
timeline_id,
generation,
@@ -459,6 +457,7 @@ impl RemoteTimelineClient {
);
let index_part = download::download_index_part(
self.conf,
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
@@ -641,44 +640,36 @@ impl RemoteTimelineClient {
/// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed
/// successfully.
pub fn schedule_layer_file_deletion(
pub async fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
deletion_queue_client: &DeletionQueueClient,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
// Synchronous update of upload queues under mutex
let with_generations = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Update the remote index file, removing the to-be-deleted files from the index,
// before deleting the actual files.
//
// Once we start removing files from upload_queue.latest_files, there's
// no going back! Otherwise, some of the files would already be removed
// from latest_files, but not yet scheduled for deletion. Use a closure
// to syntactically forbid ? or bail! calls here.
let no_bail_here = || {
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.iter()
.into_iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
Some((name, meta.generation))
Some((name.clone(), meta.generation))
} else {
// This can only happen if we forgot to to schedule the file upload
// before scheduling the delete. Log it because it is a rare/strange
// situation, and in case something is misbehaving, we'd like to know which
// layers experienced this.
info!(
"Deleting layer {name} not found in latest_files list, never uploaded?"
);
// This is unexpected: latest_files is meant to be kept up to
// date. We can't delete the layer if we have forgotten what
// generation it was in.
warn!("Deleting layer {name} not found in latest_files list");
None
}
})
@@ -688,23 +679,27 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue, metadata);
}
// schedule the actual deletions
for (name, generation) in with_generations {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: false,
generation,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {name}");
}
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
with_generations
};
no_bail_here();
// Barrier: we must ensure all prior uploads and index writes have landed in S3
// before emitting deletions.
if let Err(e) = self.wait_completion().await {
// This can only fail if upload queue is shut down: if this happens, we do
// not emit any deletions. In this condition (remote client is shut down
// during compaction or GC) we may leak some objects.
bail!("Cannot complete layer file deletions during shutdown ({e})");
}
// Enqueue deletions
deletion_queue_client
.push_layers(
self.tenant_id,
self.timeline_id,
self.generation,
with_generations,
)
.await?;
Ok(())
}
@@ -830,12 +825,13 @@ impl RemoteTimelineClient {
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
/// deletes leaked files if any and proceeds with deletion of index file at the end.
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
pub(crate) async fn delete_all(
self: &Arc<Self>,
deletion_queue: &DeletionQueueClient,
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
let (mut receiver, deletions_queued) = {
let mut deletions_queued = 0;
let layers: Vec<_> = {
let mut locked = self.upload_queue.lock().unwrap();
let stopped = locked.stopped_mut()?;
@@ -847,42 +843,30 @@ impl RemoteTimelineClient {
stopped
.upload_queue_for_deletion
.queued_operations
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
// schedule the actual deletions
for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: true,
generation: meta.generation,
});
self.calls_unfinished_metric_begin(&op);
stopped
.upload_queue_for_deletion
.queued_operations
.push_back(op);
info!("scheduled layer file deletion {name}");
deletions_queued += 1;
}
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
(
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
deletions_queued,
)
.latest_files
.drain()
.map(|kv| (kv.0, kv.1.generation))
.collect()
};
receiver.changed().await.context("upload queue shut down")?;
let layer_deletion_count = layers.len();
let layer_paths = layers
.into_iter()
.map(|(layer, generation)| {
remote_layer_path(&self.tenant_id, &self.timeline_id, &layer, generation)
})
.collect();
deletion_queue.push_immediate(layer_paths).await?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
// Execute all pending deletions, so that when we prroceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
deletion_queue.flush_immediate().await?;
let remaining = backoff::retry(
|| async {
self.storage_impl
@@ -910,17 +894,9 @@ impl RemoteTimelineClient {
})
.collect();
let not_referenced_count = remaining.len();
if !remaining.is_empty() {
backoff::retry(
|| async { self.storage_impl.delete_objects(&remaining).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("delete_objects")?;
deletion_queue.push_immediate(remaining).await?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -931,18 +907,14 @@ impl RemoteTimelineClient {
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("deleting index part");
debug!("enqueuing index part deletion");
deletion_queue
.push_immediate([index_file_path].to_vec())
.await?;
backoff::retry(
|| async { self.storage_impl.delete(&index_file_path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("delete_index")?;
// Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
deletion_queue.flush_immediate().await?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
@@ -950,7 +922,7 @@ impl RemoteTimelineClient {
))?
});
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
Ok(())
}
@@ -973,10 +945,6 @@ impl RemoteTimelineClient {
// have finished.
upload_queue.inprogress_tasks.is_empty()
}
UploadOp::Delete(_) => {
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
}
UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
};
@@ -1004,9 +972,6 @@ impl RemoteTimelineClient {
UploadOp::UploadMetadata(_, _) => {
upload_queue.num_inprogress_metadata_uploads += 1;
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions += 1;
}
UploadOp::Barrier(sender) => {
sender.send_replace(());
continue;
@@ -1140,21 +1105,6 @@ impl RemoteTimelineClient {
}
res
}
UploadOp::Delete(delete) => {
let path = &self
.conf
.timeline_path(&self.tenant_id, &self.timeline_id)
.join(delete.layer_file_name.file_name());
delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
delete.file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
.await
}
UploadOp::Barrier(_) => {
// unreachable. Barrier operations are handled synchronously in
// launch_queued_tasks
@@ -1214,15 +1164,7 @@ impl RemoteTimelineClient {
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
UploadQueue::Stopped(stopped) => {
// Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
// then stop() took care of it so we just return.
// For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
match &task.op {
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
_ => None
}
},
UploadQueue::Stopped(_) => { None }
UploadQueue::Initialized(qi) => { Some(qi) }
};
@@ -1244,9 +1186,6 @@ impl RemoteTimelineClient {
upload_queue.num_inprogress_metadata_uploads -= 1;
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions -= 1;
}
UploadOp::Barrier(_) => unreachable!(),
};
@@ -1278,13 +1217,6 @@ impl RemoteTimelineClient {
reason: "metadata uploads are tiny",
},
),
UploadOp::Delete(delete) => (
delete.file_kind,
RemoteOpKind::Delete,
DontTrackSize {
reason: "should we track deletes? positive or negative sign?",
},
),
UploadOp::Barrier(_) => {
// we do not account these
return None;
@@ -1344,7 +1276,6 @@ impl RemoteTimelineClient {
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::default(),
queued_operations: VecDeque::default(),
};
@@ -1365,9 +1296,7 @@ impl RemoteTimelineClient {
// consistency check
assert_eq!(
qi.num_inprogress_layer_uploads
+ qi.num_inprogress_metadata_uploads
+ qi.num_inprogress_deletions,
qi.num_inprogress_layer_uploads + qi.num_inprogress_metadata_uploads,
qi.inprogress_tasks.len()
);
@@ -1405,13 +1334,13 @@ pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
layer_file_name: &LayerFileName,
layer_meta: &LayerFileMetadata,
generation: Generation,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
layer_file_name.file_name(),
layer_meta.generation.get_suffix()
generation.get_suffix()
);
RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1430,30 +1359,6 @@ pub fn remote_index_path(
.expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {
Some(f) => f,
None => {
// Unexpected: we should be seeing index_part.json paths only
tracing::warn!("Malformed index key {}", path);
return None;
}
};
let file_name_str = match file_name.to_str() {
Some(s) => s,
None => {
tracing::warn!("Malformed index key {:?}", path);
return None;
}
};
match file_name_str.split_once('-') {
Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix),
None => None,
}
}
/// Files on the remote storage are stored with paths, relative to the workdir.
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
///
@@ -1461,21 +1366,25 @@ pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
pub fn remote_path(
conf: &PageServerConf,
local_path: &Path,
generation: Generation,
generation: Option<Generation>,
) -> anyhow::Result<RemotePath> {
let stripped = local_path
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")?;
let suffixed = format!(
"{0}{1}",
stripped.to_string_lossy(),
generation.get_suffix()
);
let suffixed = if let Some(generation) = generation {
format!(
"{0}{1}",
stripped.to_string_lossy(),
generation.get_suffix()
)
} else {
stripped.to_string_lossy().to_string()
};
RemotePath::new(&PathBuf::from(suffixed)).with_context(|| {
format!(
"to resolve remote part of path {:?} for base {:?}",
"Failed to resolve remote part of path {:?} for base {:?}",
local_path, conf.workdir
)
})
@@ -1486,14 +1395,18 @@ mod tests {
use super::*;
use crate::{
context::RequestContext,
deletion_queue::mock::MockDeletionQueue,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
Generation, Tenant, Timeline,
},
DEFAULT_PG_VERSION,
};
use std::{collections::HashSet, path::Path};
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
use std::{
collections::HashSet,
path::{Path, PathBuf},
};
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1550,6 +1463,9 @@ mod tests {
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
tenant_ctx: RequestContext,
remote_fs_dir: PathBuf,
client: Arc<RemoteTimelineClient>,
deletion_queue: MockDeletionQueue,
}
impl TestSetup {
@@ -1559,44 +1475,57 @@ mod tests {
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = harness.load().await;
// create an empty timeline directory
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let generation = Generation::new(0xdeadbeef);
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
let client = Arc::new(RemoteTimelineClient {
conf: harness.conf,
runtime: tokio::runtime::Handle::current(),
tenant_id: harness.tenant_id,
timeline_id: TIMELINE_ID,
generation,
storage_impl: storage.clone(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&harness.tenant_id,
&TIMELINE_ID,
)),
});
let deletion_queue = MockDeletionQueue::new(Some(storage));
Ok(Self {
harness,
tenant,
timeline,
tenant_ctx: ctx,
remote_fs_dir,
client,
deletion_queue,
})
}
/// Construct a RemoteTimelineClient in an arbitrary generation
fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
Arc::new(RemoteTimelineClient {
conf: self.harness.conf,
runtime: tokio::runtime::Handle::current(),
tenant_id: self.harness.tenant_id,
timeline_id: TIMELINE_ID,
generation,
storage_impl: self.harness.remote_storage.clone(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_id,
&TIMELINE_ID,
)),
})
}
/// A tracing::Span that satisfies remote_timeline_client methods that assert tenant_id
/// and timeline_id are present.
fn span(&self) -> tracing::Span {
tracing::info_span!(
"test",
tenant_id = %self.harness.tenant_id,
timeline_id = %TIMELINE_ID
)
}
}
// Test scheduling
@@ -1616,44 +1545,30 @@ mod tests {
// Schedule another deletion. Check that it's launched immediately.
// Schedule index upload. Check that it's queued
let test_setup = TestSetup::new("upload_scheduling").await.unwrap();
let span = test_setup.span();
let _guard = span.enter();
let TestSetup {
harness,
tenant: _tenant,
timeline,
timeline: _timeline,
tenant_ctx: _tenant_ctx,
} = test_setup;
let client = timeline.remote_client.as_ref().unwrap();
// Download back the index.json, and check that the list of files is correct
let initial_index_part = match client.download_index_file().await.unwrap() {
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
};
let initial_layers = initial_index_part
.layer_metadata
.keys()
.map(|f| f.to_owned())
.collect::<HashSet<LayerFileName>>();
let initial_layer = {
assert!(initial_layers.len() == 1);
initial_layers.into_iter().next().unwrap()
};
remote_fs_dir,
client,
deletion_queue,
} = TestSetup::new("upload_scheduling").await.unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
println!("workdir: {}", harness.conf.workdir.display());
let remote_timeline_dir = harness
.remote_fs_dir
.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
let remote_timeline_dir =
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
let generation = harness.generation;
let metadata = dummy_metadata(Lsn(0x10));
client
.init_upload_queue_for_empty_remote(&metadata)
.unwrap();
let generation = Generation::new(0xdeadbeef);
// Create a couple of dummy files, schedule upload for them
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1734,7 +1649,6 @@ mod tests {
.map(|f| f.to_owned())
.collect(),
&[
&initial_layer.file_name(),
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
],
@@ -1748,23 +1662,16 @@ mod tests {
&LayerFileMetadata::new(content_3.len() as u64, generation),
)
.unwrap();
client
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
// Deletion schedules upload of the index file, and the file deletion itself
assert!(upload_queue.queued_operations.len() == 2);
assert!(upload_queue.inprogress_tasks.len() == 1);
assert!(upload_queue.num_inprogress_layer_uploads == 1);
assert!(upload_queue.num_inprogress_deletions == 0);
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
assert_eq!(upload_queue.queued_operations.len(), 0);
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
}
assert_remote_files(
&[
&initial_layer.file_name(),
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
"index_part.json",
@@ -1773,12 +1680,50 @@ mod tests {
generation,
);
// Finish them
client
.schedule_layer_file_deletion(
&[layer_file_name_1.clone()],
&deletion_queue.new_client(),
)
.await
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
// Deletion schedules upload of the index file via RemoteTimelineClient, and
// deletion of layer files via DeletionQueue. The uploads have all been flushed
// because schedule_layer_file_deletion does a wait_completion before pushing
// to the deletion_queue
assert_eq!(upload_queue.queued_operations.len(), 0);
assert_eq!(upload_queue.inprogress_tasks.len(), 0);
assert_eq!(upload_queue.num_inprogress_layer_uploads, 0);
assert_eq!(
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
0
);
}
assert_remote_files(
&[
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
&layer_file_name_3.file_name(),
"index_part.json",
],
&remote_timeline_dir,
generation,
);
// Finish uploads and deletions
client.wait_completion().await.unwrap();
deletion_queue.pump().await;
// 1 layer was deleted
assert_eq!(deletion_queue.get_executed(), 1);
assert_remote_files(
&[
&initial_layer.file_name(),
&layer_file_name_2.file_name(),
&layer_file_name_3.file_name(),
"index_part.json",
@@ -1795,10 +1740,16 @@ mod tests {
let TestSetup {
harness,
tenant: _tenant,
timeline,
timeline: _timeline,
client,
..
} = TestSetup::new("metrics").await.unwrap();
let client = timeline.remote_client.as_ref().unwrap();
let metadata = dummy_metadata(Lsn(0x10));
client
.init_upload_queue_for_empty_remote(&metadata)
.unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1809,20 +1760,11 @@ mod tests {
)
.unwrap();
#[derive(Debug, PartialEq, Clone, Copy)]
#[derive(Debug, PartialEq)]
struct BytesStartedFinished {
started: Option<usize>,
finished: Option<usize>,
}
impl std::ops::Add for BytesStartedFinished {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self {
started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
}
}
}
let get_bytes_started_stopped = || {
let started = client
.metrics
@@ -1839,140 +1781,66 @@ mod tests {
};
// Test
tracing::info!("now doing actual test");
let actual_a = get_bytes_started_stopped();
let generation = Generation::new(0xdeadbeef);
let init = get_bytes_started_stopped();
client
.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64, harness.generation),
&LayerFileMetadata::new(content_1.len() as u64, generation),
)
.unwrap();
let actual_b = get_bytes_started_stopped();
let pre = get_bytes_started_stopped();
client.wait_completion().await.unwrap();
let actual_c = get_bytes_started_stopped();
let post = get_bytes_started_stopped();
// Validate
let expected_b = actual_a
+ BytesStartedFinished {
assert_eq!(
init,
BytesStartedFinished {
started: None,
finished: None
}
);
assert_eq!(
pre,
BytesStartedFinished {
started: Some(content_1.len()),
// assert that the _finished metric is created eagerly so that subtractions work on first sample
finished: Some(0),
};
assert_eq!(actual_b, expected_b);
let expected_c = actual_a
+ BytesStartedFinished {
started: Some(content_1.len()),
finished: Some(content_1.len()),
};
assert_eq!(actual_c, expected_c);
}
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
// An empty IndexPart, just sufficient to ensure deserialization will succeed
let example_metadata = TimelineMetadata::example();
let example_index_part = IndexPart::new(
HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
);
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
timeline_path
.strip_prefix(&test_state.harness.conf.workdir)
.unwrap(),
);
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
let index_path = test_state.harness.remote_fs_dir.join(
remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
);
eprintln!("Writing {}", index_path.display());
std::fs::write(&index_path, index_part_bytes).unwrap();
example_index_part
}
/// Assert that when a RemoteTimelineclient in generation `get_generation` fetches its
/// index, the IndexPart returned is equal to `expected`
async fn assert_got_index_part(
test_state: &TestSetup,
get_generation: Generation,
expected: &IndexPart,
) {
let client = test_state.build_client(get_generation);
let download_r = client
.download_index_file()
.await
.expect("download should always succeed");
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
match download_r {
MaybeDeletedIndexPart::IndexPart(index_part) => {
assert_eq!(&index_part, expected);
}
MaybeDeletedIndexPart::Deleted(_index_part) => panic!("Test doesn't set deleted_at"),
}
);
assert_eq!(
post,
BytesStartedFinished {
started: Some(content_1.len()),
finished: Some(content_1.len())
}
);
}
#[tokio::test]
async fn index_part_download_simple() -> anyhow::Result<()> {
let test_state = TestSetup::new("index_part_download_simple").await.unwrap();
let span = test_state.span();
let _guard = span.enter();
// #[tokio::test]
// async fn index_part_download() {
// let TestSetup {
// harness,
// tenant: _tenant,
// timeline: _timeline,
// client,
// ..
// } = TestSetup::new("index_part_download").await.unwrap();
// Simple case: we are in generation N, load the index from generation N - 1
let generation_n = 5;
let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
// let example_index_part = IndexPart {
// version: 3,
// timeline_layers: HashSet::new(),
// layer_metadata:
assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await;
// }
Ok(())
}
#[tokio::test]
async fn index_part_download_ordering() -> anyhow::Result<()> {
let test_state = TestSetup::new("index_part_download_ordering")
.await
.unwrap();
let span = test_state.span();
let _guard = span.enter();
// A generation-less IndexPart exists in the bucket, we should find it
let generation_n = 5;
let injected_none = inject_index_part(&test_state, Generation::none()).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await;
// If a more recent-than-none generation exists, we should prefer to load that
let injected_1 = inject_index_part(&test_state, Generation::new(1)).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
// If a more-recent-than-me generation exists, we should ignore it.
let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
// If a directly previous generation exists, _and_ an index exists in my own
// generation, I should prefer my own generation.
let _injected_prev =
inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await;
assert_got_index_part(
&test_state,
Generation::new(generation_n),
&injected_current,
)
.await;
Ok(())
}
// }
}

View File

@@ -1,34 +0,0 @@
//! Helper functions to delete files from remote storage with a RemoteStorage
use anyhow::Context;
use std::path::Path;
use tracing::debug;
use remote_storage::GenericRemoteStorage;
use crate::{
config::PageServerConf,
tenant::{remote_timeline_client::remote_path, Generation},
};
pub(super) async fn delete_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_layer_path: &'a Path,
generation: Generation,
) -> anyhow::Result<()> {
fail::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
let path_to_delete = remote_path(conf, local_layer_path, generation)?;
// We don't want to print an error if the delete failed if the file has
// already been deleted. Thankfully, in this situation S3 already
// does not yield an error. While OS-provided local file system APIs do yield
// errors, we avoid them in the `LocalFs` wrapper.
storage
.delete(&path_to_delete)
.await
.with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
}

View File

@@ -19,15 +19,12 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation;
use remote_storage::{DownloadError, GenericRemoteStorage};
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, LayerFileMetadata};
use super::{
parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
};
use super::{remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -50,7 +47,12 @@ pub async fn download_layer_file<'a>(
.timeline_path(&tenant_id, &timeline_id)
.join(layer_file_name.file_name());
let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
let remote_path = remote_layer_path(
&tenant_id,
&timeline_id,
layer_file_name,
layer_metadata.generation,
);
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
@@ -67,43 +69,33 @@ pub async fn download_layer_file<'a>(
let (mut destination_file, bytes_amount) = download_retry(
|| async {
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file = fs::File::create(&temp_file_path)
.await
.with_context(|| {
format!(
"create a destination file for layer '{}'",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
let mut download = storage
.download(&remote_path)
.await
.with_context(|| {
format!(
"open a download stream for layer with remote storage path '{remote_path:?}'"
)
})
.map_err(DownloadError::Other)?;
let bytes_amount = tokio::time::timeout(
MAX_DOWNLOAD_DURATION,
tokio::io::copy(&mut download.download_stream, &mut destination_file),
)
.await
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
.with_context(|| {
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
"create a destination file for layer '{}'",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
let mut download = storage.download(&remote_path).await.with_context(|| {
format!(
"open a download stream for layer with remote storage path '{remote_path:?}'"
)
})
.map_err(DownloadError::Other)?;
let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
.await
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
.with_context(|| {
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
})
.map_err(DownloadError::Other)?;
Ok((destination_file, bytes_amount))
},
&format!("download {remote_path:?}"),
)
.await?;
).await?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
@@ -116,7 +108,12 @@ pub async fn download_layer_file<'a>(
destination_file
.flush()
.await
.with_context(|| format!("flush source file at {}", temp_file_path.display()))
.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
let expected = layer_metadata.file_size();
@@ -147,12 +144,17 @@ pub async fn download_layer_file<'a>(
fs::rename(&temp_file_path, &local_path)
.await
.with_context(|| format!("rename download layer file to {}", local_path.display(),))
.with_context(|| {
format!(
"Could not rename download layer file to {}",
local_path.display(),
)
})
.map_err(DownloadError::Other)?;
crashsafe::fsync_async(&local_path)
.await
.with_context(|| format!("fsync layer file {}", local_path.display(),))
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
.map_err(DownloadError::Other)?;
tracing::debug!("download complete: {}", local_path.display());
@@ -203,9 +205,9 @@ pub async fn list_remote_timelines(
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
let timeline_id: TimelineId = object_name
.parse()
.with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
let timeline_id: TimelineId = object_name.parse().with_context(|| {
format!("failed to parse object name into timeline id '{object_name}'")
})?;
// list_prefixes is assumed to return unique names. Ensure this here.
// NB: it's safer to bail out than warn-log this because the pageserver
@@ -223,6 +225,7 @@ pub async fn list_remote_timelines(
}
async fn do_download_index_part(
local_path: &Path,
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
@@ -231,92 +234,83 @@ async fn do_download_index_part(
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
let index_part_bytes = download_retry(
|| async {
let mut index_part_download = storage.download(&remote_path).await?;
let mut index_part_bytes = Vec::new();
tokio::io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| format!("download index part at {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok(index_part_bytes)
},
|| storage.download_all(&remote_path),
&format!("download {remote_path:?}"),
)
.await?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
.with_context(|| format!("download index part file at {remote_path:?}"))
.with_context(|| format!("Failed to deserialize index part file into file {local_path:?}"))
.map_err(DownloadError::Other)?;
Ok(index_part)
}
/// index_part.json objects are suffixed with a generation number, so we cannot
/// directly GET the latest index part without doing some probing.
///
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
pub(super) async fn download_index_part(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
my_generation: Generation,
) -> Result<IndexPart, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let local_path = conf
.metadata_path(tenant_id, timeline_id)
.with_file_name(IndexPart::FILE_NAME);
if my_generation.is_none() {
// Operating without generations: just fetch the generation-less path
return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
return do_download_index_part(&local_path, storage, tenant_id, timeline_id, my_generation)
.await;
}
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
// index in our generation.
//
// This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
match res {
Ok(index_part) => {
tracing::debug!(
"Found index_part from current generation (this is a stale attachment)"
);
return Ok(index_part);
}
Err(DownloadError::NotFound) => {}
Err(e) => return Err(e),
};
let previous_gen = my_generation.previous();
let r_previous =
do_download_index_part(&local_path, storage, tenant_id, timeline_id, previous_gen).await;
// Typical case: the previous generation of this tenant was running healthily, and had uploaded
// and index part. We may safely start from this index without doing a listing, because:
// - We checked for current generation case above
// - generations > my_generation are to be ignored
// - any other indices that exist would have an older generation than `previous_gen`, and
// we want to find the most recent index from a previous generation.
//
// This is an optimization to avoid doing the listing for the general case below.
let res =
do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
match res {
match r_previous {
Ok(index_part) => {
tracing::debug!("Found index_part from previous generation");
tracing::debug!("Found index_part from previous generation {previous_gen}");
return Ok(index_part);
}
Err(DownloadError::NotFound) => {
tracing::debug!(
"No index_part found from previous generation, falling back to listing"
);
}
Err(e) => {
return Err(e);
if matches!(e, DownloadError::NotFound) {
tracing::debug!("No index_part found from previous generation {previous_gen}, falling back to listing");
} else {
return Err(e);
}
}
};
/// Given the key of an index, parse out the generation part of the name
fn parse_generation(path: RemotePath) -> Option<Generation> {
let path = path.take();
let file_name = match path.file_name() {
Some(f) => f,
None => {
// Unexpected: we should be seeing index_part.json paths only
tracing::warn!("Malformed index key {0}", path.display());
return None;
}
};
let file_name_str = match file_name.to_str() {
Some(s) => s,
None => {
tracing::warn!("Malformed index key {0}", path.display());
return None;
}
};
match file_name_str.split_once("-") {
Some((_, gen_suffix)) => u32::from_str_radix(gen_suffix, 16)
.map(|g| Generation::new(g))
.ok(),
None => None,
}
}
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
// objects, and select the highest one with a generation <= my_generation.
// Fallback: we did not find an index_part.json from the previous generation, so
// we will list all the index_part objects and pick the most recent.
let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
let indices = backoff::retry(
|| async { storage.list_files(Some(&index_prefix)).await },
@@ -330,26 +324,38 @@ pub(super) async fn download_index_part(
}),
)
.await
.map_err(DownloadError::Other)?;
.map_err(|e| DownloadError::Other(e))?;
// General case logic for which index to use: the latest index whose generation
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
let max_previous_generation = indices
let mut generations: Vec<_> = indices
.into_iter()
.filter_map(parse_remote_index_path)
.filter_map(|k| parse_generation(k))
.filter(|g| g <= &my_generation)
.max();
.collect();
match max_previous_generation {
generations.sort();
match generations.last() {
Some(g) => {
tracing::debug!("Found index_part in generation {g:?}");
do_download_index_part(storage, tenant_id, timeline_id, g).await
tracing::debug!("Found index_part in generation {g} (my generation {my_generation})");
do_download_index_part(&local_path, storage, tenant_id, timeline_id, *g).await
}
None => {
// Migration from legacy pre-generation state: we have a generation but no prior
// attached pageservers did. Try to load from a no-generation path.
tracing::info!("No index_part.json* found");
do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
// This is not an error: the timeline may be newly created, or we may be
// upgrading and have no historical index_part with a generation suffix.
// Fall back to trying to load the un-suffixed index_part.json.
tracing::info!(
"No index_part.json-* found when loading {}/{} in generation {}",
tenant_id,
timeline_id,
my_generation
);
return do_download_index_part(
&local_path,
storage,
tenant_id,
timeline_id,
Generation::none(),
)
.await;
}
}
}

View File

@@ -2,7 +2,7 @@
//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
//! remote timeline layers and its metadata.
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
@@ -69,6 +69,10 @@ pub struct IndexPart {
#[serde(skip_serializing_if = "Option::is_none")]
pub deleted_at: Option<NaiveDateTime>,
/// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
#[serde(default, skip_deserializing)]
timeline_layers: HashSet<LayerFileName>,
/// Per layer file name metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -94,12 +98,7 @@ impl IndexPart {
/// - 2: added `deleted_at`
/// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
/// is always generated from the keys of `layer_metadata`)
/// - 4: timeline_layers is fully removed.
const LATEST_VERSION: usize = 4;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];
const LATEST_VERSION: usize = 3;
pub const FILE_NAME: &'static str = "index_part.json";
pub fn new(
@@ -107,30 +106,24 @@ impl IndexPart {
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
) -> Self {
// Transform LayerFileMetadata into IndexLayerMetadata
let layer_metadata = layers_and_metadata
.into_iter()
.map(|(k, v)| (k, IndexLayerMetadata::from(v)))
.collect();
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
for (remote_name, metadata) in &layers_and_metadata {
timeline_layers.insert(remote_name.to_owned());
let metadata = IndexLayerMetadata::from(metadata);
layer_metadata.insert(remote_name.to_owned(), metadata);
}
Self {
version: Self::LATEST_VERSION,
timeline_layers,
layer_metadata,
disk_consistent_lsn,
metadata,
deleted_at: None,
}
}
pub fn get_version(&self) -> usize {
self.version
}
/// If you want this under normal operations, read it from self.metadata:
/// this method is just for the scrubber to use when validating an index.
pub fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn
}
}
impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -151,15 +144,15 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {
/// Serialized form of [`LayerFileMetadata`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexLayerMetadata {
pub file_size: u64,
pub(super) file_size: u64,
#[serde(default = "Generation::none")]
#[serde(skip_serializing_if = "Generation::is_none")]
pub(super) generation: Generation,
}
impl From<LayerFileMetadata> for IndexLayerMetadata {
fn from(other: LayerFileMetadata) -> Self {
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &'_ LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
generation: other.generation,
@@ -187,6 +180,7 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -225,6 +219,7 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -264,6 +259,7 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -298,6 +294,7 @@ mod tests {
let expected = IndexPart {
version: 1,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::new(),
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[
@@ -330,41 +327,4 @@ mod tests {
assert_eq!(empty_layers_parsed, expected);
}
#[test]
fn v4_indexpart_is_parsed() {
let example = r#"{
"version":4,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123"
}"#;
let expected = IndexPart {
version: 4,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
generation: Generation::none()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
}

View File

@@ -31,8 +31,8 @@ pub(super) async fn upload_index_part<'a>(
bail!("failpoint before-upload-index")
});
let index_part_bytes =
serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
let index_part_bytes = serde_json::to_vec(&index_part)
.context("Failed to serialize index part file into bytes")?;
let index_part_size = index_part_bytes.len();
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
@@ -40,7 +40,7 @@ pub(super) async fn upload_index_part<'a>(
storage
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
.await
.with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
}
/// Attempts to upload given layer files.
@@ -58,7 +58,7 @@ pub(super) async fn upload_timeline_layer<'a>(
bail!("failpoint before-upload-layer")
});
let storage_path = remote_path(conf, source_path, generation)?;
let storage_path = remote_path(conf, source_path, Some(generation))?;
let source_file_res = fs::File::open(&source_path).await;
let source_file = match source_file_res {
Ok(source_file) => source_file,
@@ -71,15 +71,16 @@ pub(super) async fn upload_timeline_layer<'a>(
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
return Ok(());
}
Err(e) => {
Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
}
Err(e) => Err(e)
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?,
};
let fs_size = source_file
.metadata()
.await
.with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
.with_context(|| {
format!("Failed to get the source file metadata for layer {source_path:?}")
})?
.len();
let metadata_size = known_metadata.file_size();
@@ -87,13 +88,19 @@ pub(super) async fn upload_timeline_layer<'a>(
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
}
let fs_size = usize::try_from(fs_size)
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
let fs_size = usize::try_from(fs_size).with_context(|| {
format!("File {source_path:?} size {fs_size} could not be converted to usize")
})?;
storage
.upload(source_file, fs_size, &storage_path, None)
.await
.with_context(|| format!("upload layer from local path '{}'", source_path.display()))?;
.with_context(|| {
format!(
"Failed to upload a layer from local path '{}'",
source_path.display()
)
})?;
Ok(())
}

View File

@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -45,7 +45,8 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::{self, File};
use std::io::SeekFrom;
use std::io::{BufWriter, Write};
use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
@@ -218,7 +219,7 @@ pub struct DeltaLayerInner {
index_root_blk: u32,
/// Reader object for reading blocks from the file.
file: FileBlockReader,
file: FileBlockReader<VirtualFile>,
}
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
@@ -582,14 +583,14 @@ struct DeltaLayerWriterInner {
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
blob_writer: BlobWriter<true>,
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
}
impl DeltaLayerWriterInner {
///
/// Start building a new delta layer.
///
async fn new(
fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -604,10 +605,11 @@ impl DeltaLayerWriterInner {
// FIXME: throw an error instead?
let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
let mut file = VirtualFile::create(&path).await?;
let mut file = VirtualFile::create(&path)?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
let buf_writer = BufWriter::new(file);
let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64);
// Initialize the b-tree index builder
let block_buf = BlockBuf::new();
@@ -630,12 +632,11 @@ impl DeltaLayerWriterInner {
///
/// The values must be appended in key, lsn order.
///
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
.await
}
async fn put_value_bytes(
fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
@@ -644,7 +645,7 @@ impl DeltaLayerWriterInner {
) -> anyhow::Result<()> {
assert!(self.lsn_range.start <= lsn);
let off = self.blob_writer.write_blob(val).await?;
let off = self.blob_writer.write_blob(val)?;
let blob_ref = BlobRef::new(off, will_init);
@@ -661,18 +662,18 @@ impl DeltaLayerWriterInner {
///
/// Finish writing the delta layer.
///
async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
let mut file = self.blob_writer.into_inner().await?;
let buf_writer = self.blob_writer.into_inner();
let mut file = buf_writer.into_inner()?;
// Write out the index
let (index_root_blk, block_buf) = self.tree.finish()?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
.await?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
for buf in block_buf.blocks {
file.write_all(buf.as_ref()).await?;
file.write_all(buf.as_ref())?;
}
assert!(self.lsn_range.start < self.lsn_range.end);
// Fill in the summary on blk 0
@@ -686,22 +687,11 @@ impl DeltaLayerWriterInner {
index_start_blk,
index_root_blk,
};
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
Summary::ser_into(&summary, &mut buf)?;
if buf.spilled() {
// This is bad as we only have one free block for the summary
warn!(
"Used more than one page size for summary buffer: {}",
buf.len()
);
}
file.seek(SeekFrom::Start(0)).await?;
file.write_all(&buf).await?;
file.seek(SeekFrom::Start(0))?;
Summary::ser_into(&summary, &mut file)?;
let metadata = file
.metadata()
.await
.context("get file metadata to determine size")?;
// 5GB limit for objects without multipart upload (which we don't want to use)
@@ -732,7 +722,7 @@ impl DeltaLayerWriterInner {
};
// fsync the file
file.sync_all().await?;
file.sync_all()?;
// Rename the file to its final name
//
// Note: This overwrites any existing file. There shouldn't be any.
@@ -784,7 +774,7 @@ impl DeltaLayerWriter {
///
/// Start building a new delta layer.
///
pub async fn new(
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -792,10 +782,13 @@ impl DeltaLayerWriter {
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(
DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
.await?,
),
inner: Some(DeltaLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_start,
lsn_range,
)?),
})
}
@@ -804,11 +797,11 @@ impl DeltaLayerWriter {
///
/// The values must be appended in key, lsn order.
///
pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val).await
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val)
}
pub async fn put_value_bytes(
pub fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
@@ -819,7 +812,6 @@ impl DeltaLayerWriter {
.as_mut()
.unwrap()
.put_value_bytes(key, lsn, val, will_init)
.await
}
pub fn size(&self) -> u64 {
@@ -829,18 +821,21 @@ impl DeltaLayerWriter {
///
/// Finish writing the delta layer.
///
pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end).await
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end)
}
}
impl Drop for DeltaLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
// We want to remove the virtual file here, so it's fine to not
// having completely flushed unwritten data.
let vfile = inner.blob_writer.into_inner_no_flush();
vfile.remove();
match inner.blob_writer.into_inner().into_inner() {
Ok(vfile) => vfile.remove(),
Err(err) => warn!(
"error while flushing buffer of image layer temporary file: {}",
err
),
}
}
}
}
@@ -851,7 +846,6 @@ impl DeltaLayerInner {
summary: Option<Summary>,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);

View File

@@ -212,7 +212,7 @@ pub enum LayerFileName {
}
impl LayerFileName {
pub fn file_name(&self) -> String {
pub(crate) fn file_name(&self) -> String {
self.to_string()
}

View File

@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -42,7 +42,8 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::{self, File};
use std::io::SeekFrom;
use std::io::Write;
use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
@@ -154,7 +155,7 @@ pub struct ImageLayerInner {
lsn: Lsn,
/// Reader object for reading blocks from the file.
file: FileBlockReader,
file: FileBlockReader<VirtualFile>,
}
impl std::fmt::Debug for ImageLayerInner {
@@ -438,7 +439,6 @@ impl ImageLayerInner {
summary: Option<Summary>,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0).await?;
@@ -511,7 +511,7 @@ struct ImageLayerWriterInner {
key_range: Range<Key>,
lsn: Lsn,
blob_writer: BlobWriter<false>,
blob_writer: WriteBlobWriter<VirtualFile>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
@@ -519,7 +519,7 @@ impl ImageLayerWriterInner {
///
/// Start building a new image layer.
///
async fn new(
fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -541,11 +541,10 @@ impl ImageLayerWriterInner {
let mut file = VirtualFile::open_with_options(
&path,
std::fs::OpenOptions::new().write(true).create_new(true),
)
.await?;
)?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
// Initialize the b-tree index builder
let block_buf = BlockBuf::new();
@@ -570,9 +569,9 @@ impl ImageLayerWriterInner {
///
/// The page versions must be appended in blknum order.
///
async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img).await?;
let off = self.blob_writer.write_blob(img)?;
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
@@ -584,18 +583,17 @@ impl ImageLayerWriterInner {
///
/// Finish writing the image layer.
///
async fn finish(self) -> anyhow::Result<ImageLayer> {
fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
let mut file = self.blob_writer.into_inner();
// Write out the index
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
.await?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
let (index_root_blk, block_buf) = self.tree.finish()?;
for buf in block_buf.blocks {
file.write_all(buf.as_ref()).await?;
file.write_all(buf.as_ref())?;
}
// Fill in the summary on blk 0
@@ -609,22 +607,11 @@ impl ImageLayerWriterInner {
index_start_blk,
index_root_blk,
};
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
Summary::ser_into(&summary, &mut buf)?;
if buf.spilled() {
// This is bad as we only have one free block for the summary
warn!(
"Used more than one page size for summary buffer: {}",
buf.len()
);
}
file.seek(SeekFrom::Start(0)).await?;
file.write_all(&buf).await?;
file.seek(SeekFrom::Start(0))?;
Summary::ser_into(&summary, &mut file)?;
let metadata = file
.metadata()
.await
.context("get metadata to determine file size")?;
let desc = PersistentLayerDesc::new_img(
@@ -647,7 +634,7 @@ impl ImageLayerWriterInner {
};
// fsync the file
file.sync_all().await?;
file.sync_all()?;
// Rename the file to its final name
//
@@ -700,7 +687,7 @@ impl ImageLayerWriter {
///
/// Start building a new image layer.
///
pub async fn new(
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -708,9 +695,13 @@ impl ImageLayerWriter {
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(
ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
),
inner: Some(ImageLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_range,
lsn,
)?),
})
}
@@ -719,15 +710,15 @@ impl ImageLayerWriter {
///
/// The page versions must be appended in blknum order.
///
pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img).await
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img)
}
///
/// Finish writing the image layer.
///
pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish().await
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish()
}
}

View File

@@ -236,7 +236,7 @@ impl InMemoryLayer {
///
/// Create a new, empty, in-memory layer
///
pub async fn create(
pub fn create(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -244,7 +244,7 @@ impl InMemoryLayer {
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
let file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
Ok(InMemoryLayer {
conf,
@@ -333,8 +333,7 @@ impl InMemoryLayer {
self.tenant_id,
Key::MIN,
self.start_lsn..end_lsn,
)
.await?;
)?;
let mut buf = Vec::new();
@@ -349,13 +348,11 @@ impl InMemoryLayer {
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf).await?;
let will_init = Value::des(&buf)?.will_init();
delta_layer_writer
.put_value_bytes(key, *lsn, &buf, will_init)
.await?;
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
}
}
let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
Ok(delta_layer)
}
}

View File

@@ -74,7 +74,7 @@ impl Layer for RemoteLayer {
_reconstruct_state: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
bail!("layer {self} needs to be downloaded");
}
}

View File

@@ -102,7 +102,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let started_at = Instant::now();
let sleep_duration = if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
info!("automatic compaction is disabled");
// check again in 10 seconds, in case it's been enabled again.
Duration::from_secs(10)
@@ -167,7 +166,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let gc_horizon = tenant.get_gc_horizon();
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
#[cfg(not(feature = "testing"))]
info!("automatic GC is disabled");
// check again in 10 seconds, in case it's been enabled again.
Duration::from_secs(10)

View File

@@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
@@ -90,7 +91,6 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::debug_assert_current_span_has_tenant_and_timeline_id;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{
@@ -143,6 +143,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: Option<RemoteTimelineClient>,
pub deletion_queue_client: Option<DeletionQueueClient>,
}
pub struct Timeline {
@@ -154,8 +155,7 @@ pub struct Timeline {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
/// Never changes for the lifetime of this [`Timeline`] object.
// The generation of the tenant that instantiated us: this is used for safety when writing remote objects
generation: Generation,
pub pg_version: u32,
@@ -201,6 +201,9 @@ pub struct Timeline {
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
pub remote_client: Option<Arc<RemoteTimelineClient>>,
/// Deletion queue: a global queue, separate to the remote storage queue's
deletion_queue_client: Option<Arc<DeletionQueueClient>>,
// What page versions do we hold in the repository? If we get a
// request > last_record_lsn, we need to wait until we receive all
// the WAL up to the request. The SeqWait provides functions for
@@ -585,7 +588,15 @@ impl Timeline {
Err(e) => {
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
drop(_timer);
let walreceiver_status = self.walreceiver_status();
let walreceiver_status = {
match &*self.walreceiver.lock().unwrap() {
None => "stopping or stopped".to_string(),
Some(walreceiver) => match walreceiver.status() {
Some(status) => status.to_human_readable_string(),
None => "Not active".to_string(),
},
}
};
Err(anyhow::Error::new(e).context({
format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
@@ -599,16 +610,6 @@ impl Timeline {
}
}
pub(crate) fn walreceiver_status(&self) -> String {
match &*self.walreceiver.lock().unwrap() {
None => "stopping or stopped".to_string(),
Some(walreceiver) => match walreceiver.status() {
Some(status) => status.to_human_readable_string(),
None => "Not active".to_string(),
},
}
}
/// Check that it is valid to request operations with that lsn.
pub fn check_lsn_is_in_scope(
&self,
@@ -936,48 +937,6 @@ impl Timeline {
self.launch_eviction_task(background_jobs_can_start);
}
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
debug_assert_current_span_has_tenant_and_timeline_id();
// prevent writes to the InMemoryLayer
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_id),
Some(self.timeline_id),
)
.await;
// now all writers to InMemory layer are gone, do the final flush if requested
if freeze_and_flush {
match self.freeze_and_flush().await {
Ok(()) => {}
Err(e) => {
warn!("failed to freeze and flush: {e:#}");
return; // TODO: should probably drain remote timeline client anyways?
}
}
// drain the upload queue
let res = if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.wait_completion().await
} else {
Ok(())
};
if let Err(e) = res {
warn!("failed to await for frozen and flushed uploads: {e:#}");
}
}
}
pub fn set_state(&self, new_state: TimelineState) {
match (self.current_state(), new_state) {
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
@@ -1312,6 +1271,18 @@ impl Timeline {
Ok(())
}
async fn delete_all_remote(&self) -> anyhow::Result<()> {
if let Some(remote_client) = &self.remote_client {
if let Some(deletion_queue_client) = &self.deletion_queue_client {
remote_client.delete_all(deletion_queue_client).await
} else {
Ok(())
}
} else {
Ok(())
}
}
}
#[derive(Debug, thiserror::Error)]
@@ -1466,6 +1437,7 @@ impl Timeline {
walreceiver: Mutex::new(None),
remote_client: resources.remote_client.map(Arc::new),
deletion_queue_client: resources.deletion_queue_client.map(Arc::new),
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
last_record_lsn: SeqWait::new(RecordLsn {
@@ -1726,18 +1698,11 @@ impl Timeline {
for (name, decision) in decided {
let decision = match decision {
Ok(UseRemote { local, remote }) => {
// Remote is authoritative, but we may still choose to retain
// the local file if the contents appear to match
if local.file_size() == remote.file_size() {
// Use the local file, but take the remote metadata so that we pick up
// the correct generation.
UseLocal(remote)
} else {
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
UseRemote { local, remote }
}
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
UseRemote { local, remote }
}
Ok(decision) => decision,
Err(FutureLayer { local }) => {
@@ -1816,11 +1781,15 @@ impl Timeline {
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
if let Some(rtc) = self.remote_client.as_ref() {
// Deletion queue client is always Some if remote_client is Some
let deletion_queue_client = self.deletion_queue_client.as_ref().unwrap();
let (needs_upload, needs_cleanup) = to_sync;
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_layer_file_deletion(&needs_cleanup, deletion_queue_client)
.await?;
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
@@ -2546,15 +2515,13 @@ impl Timeline {
///
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut guard = self.layers.write().await;
let layer = guard
.get_layer_for_write(
lsn,
self.get_last_record_lsn(),
self.conf,
self.timeline_id,
self.tenant_id,
)
.await?;
let layer = guard.get_layer_for_write(
lsn,
self.get_last_record_lsn(),
self.conf,
self.timeline_id,
self.tenant_id,
)?;
Ok(layer)
}
@@ -2758,7 +2725,9 @@ impl Timeline {
// update metrics
let sz = l.layer_desc().file_size;
self.metrics.record_new_file_metrics(sz);
self.metrics.resident_physical_size_gauge.add(sz);
self.metrics.num_persistent_files_created.inc_by(1);
self.metrics.persistent_bytes_written.inc_by(sz);
}
guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
@@ -2787,7 +2756,6 @@ impl Timeline {
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)
.await
.context("update_metadata_file")?;
// Also update the in-memory copy
self.disk_consistent_lsn.store(disk_consistent_lsn);
@@ -2796,7 +2764,7 @@ impl Timeline {
}
/// Update metadata file
async fn update_metadata_file(
fn update_metadata_file(
&self,
disk_consistent_lsn: Lsn,
layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
@@ -2837,9 +2805,14 @@ impl Timeline {
x.unwrap()
));
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
.await
.context("save_metadata")?;
save_metadata(
self.conf,
&self.tenant_id,
&self.timeline_id,
&metadata,
false,
)
.context("save_metadata")?;
if let Some(remote_client) = &self.remote_client {
for (path, layer_metadata) in layer_paths_to_upload {
@@ -3042,8 +3015,7 @@ impl Timeline {
self.tenant_id,
&img_range,
lsn,
)
.await?;
)?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(PageReconstructError::Other(anyhow::anyhow!(
@@ -3079,11 +3051,11 @@ impl Timeline {
}
}
};
image_layer_writer.put_image(key, &img).await?;
image_layer_writer.put_image(key, &img)?;
key = key.next();
}
}
let image_layer = image_layer_writer.finish().await?;
let image_layer = image_layer_writer.finish()?;
image_layers.push(image_layer);
}
}
@@ -3133,8 +3105,9 @@ impl Timeline {
LayerFileMetadata::new(metadata.len(), self.generation),
);
// update metrics
self.metrics.record_new_file_metrics(metadata.len());
self.metrics
.resident_physical_size_gauge
.add(metadata.len());
let l = Arc::new(l);
l.access_stats().record_residence_event(
LayerResidenceStatus::Resident,
@@ -3627,11 +3600,7 @@ impl Timeline {
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(Arc::new(
writer
.take()
.unwrap()
.finish(prev_key.unwrap().next())
.await?,
writer.take().unwrap().finish(prev_key.unwrap().next())?,
));
writer = None;
@@ -3646,23 +3615,20 @@ impl Timeline {
}
if writer.is_none() {
// Create writer if not initiaized yet
writer = Some(
DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
key,
if dup_end_lsn.is_valid() {
// this is a layer containing slice of values of the same key
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
dup_start_lsn..dup_end_lsn
} else {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
)
.await?,
);
writer = Some(DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
key,
if dup_end_lsn.is_valid() {
// this is a layer containing slice of values of the same key
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
dup_start_lsn..dup_end_lsn
} else {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
)?);
}
fail_point!("delta-layer-writer-fail-before-finish", |_| {
@@ -3671,11 +3637,11 @@ impl Timeline {
)))
});
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
writer.as_mut().unwrap().put_value(key, lsn, value)?;
prev_key = Some(key);
}
if let Some(writer) = writer {
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next()).await?));
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
}
// Sync layers
@@ -3816,8 +3782,10 @@ impl Timeline {
)?;
}
// update metrics, including the timeline's physical size
self.metrics.record_new_file_metrics(metadata.len());
// update the timeline's physical size
self.metrics
.resident_physical_size_gauge
.add(metadata.len());
new_layer_paths.insert(
new_delta_path,
@@ -3862,7 +3830,13 @@ impl Timeline {
// Also schedule the deletions in remote storage
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
let deletion_queue = self
.deletion_queue_client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Remote storage enabled without deletion queue"))?;
remote_client
.schedule_layer_file_deletion(&layer_names_to_delete, deletion_queue)
.await?;
}
Ok(())
@@ -4175,8 +4149,7 @@ impl Timeline {
if !layers_to_remove.is_empty() {
// Persist the new GC cutoff value in the metadata file, before
// we actually remove anything.
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())
.await?;
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
// Actually delete the layers from disk and remove them from the map.
// (couldn't do this in the loop above, because you cannot modify a collection
@@ -4197,7 +4170,15 @@ impl Timeline {
}
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
// Remote metadata upload was scheduled in `update_metadata_file`: wait
// for completion before scheduling any deletions.
remote_client.wait_completion().await?;
let deletion_queue = self.deletion_queue_client.as_ref().ok_or_else(|| {
anyhow::anyhow!("Remote storage enabled without deletion queue")
})?;
remote_client
.schedule_layer_file_deletion(&layer_names_to_delete, deletion_queue)
.await?;
}
apply.flush();
@@ -4787,6 +4768,7 @@ mod tests {
use utils::{id::TimelineId, lsn::Lsn};
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
use super::{EvictionError, Timeline};
@@ -4796,8 +4778,30 @@ mod tests {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()), harness.conf);
let ctx = any_context();
let tenant = harness.try_load(&ctx).await.unwrap();
let tenant = harness
.try_load(
&ctx,
Some(remote_storage),
Some(deletion_queue.new_client()),
)
.await
.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
@@ -4847,8 +4851,30 @@ mod tests {
async fn layer_eviction_aba_fails() {
let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()), harness.conf);
let ctx = any_context();
let tenant = harness.try_load(&ctx).await.unwrap();
let tenant = harness
.try_load(
&ctx,
Some(remote_storage),
Some(deletion_queue.new_client()),
)
.await
.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await

View File

@@ -14,6 +14,7 @@ use utils::{
use crate::{
config::PageServerConf,
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
@@ -238,15 +239,6 @@ async fn delete_local_layer_files(
Ok(())
}
/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
Ok(())
}
// This function removs remaining traces of a timeline on disk.
// Namely: metadata file, timeline directory, delete mark.
// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
@@ -407,6 +399,7 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: Option<DeletionQueueClient>,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -416,7 +409,10 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
TimelineResources { remote_client },
TimelineResources {
remote_client,
deletion_queue_client,
},
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
@@ -559,7 +555,7 @@ impl DeleteTimelineFlow {
) -> Result<(), DeleteTimelineError> {
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
delete_remote_layers_and_index(timeline).await?;
timeline.delete_all_remote().await?;
pausable_failpoint!("in_progress_delete");

View File

@@ -328,24 +328,9 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
//
// It is critical we are responsive to cancellation here. Otherwise, we deadlock with
// tenant deletion (holds TENANTS in read mode) any other task that attempts to
// acquire TENANTS in write mode before we here call get_tenant.
// See https://github.com/neondatabase/neon/issues/5284.
let res = tokio::select! {
_ = cancel.cancelled() => {
return ControlFlow::Break(());
}
res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
res
}
};
let tenant = match res {
Ok(t) => t,
Err(_) => {
return ControlFlow::Break(());
}
let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else {
// likely, we're shutting down
return ControlFlow::Break(());
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {

View File

@@ -147,7 +147,11 @@ pub(super) fn reconcile(
Err(FutureLayer { local })
} else {
Ok(match (local, remote) {
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
(Some(local), Some(remote)) if local != remote => {
assert_eq!(local.generation, remote.generation);
UseRemote { local, remote }
}
(Some(x), Some(_)) => UseLocal(x),
(None, Some(x)) => Evicted(x),
(Some(x), None) => NeedsUpload(x),

View File

@@ -87,7 +87,7 @@ impl LayerManager {
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
/// called within `get_layer_for_write`.
pub(crate) async fn get_layer_for_write(
pub(crate) fn get_layer_for_write(
&mut self,
lsn: Lsn,
last_record_lsn: Lsn,
@@ -129,7 +129,7 @@ impl LayerManager {
lsn
);
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn)?;
let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone());

View File

@@ -135,7 +135,7 @@ impl WalReceiver {
.await;
}
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
pub(super) fn status(&self) -> Option<ConnectionManagerStatus> {
self.manager_status.read().unwrap().clone()
}
}

View File

@@ -1,7 +1,4 @@
use crate::metrics::RemoteOpFileKind;
use super::storage_layer::LayerFileName;
use super::Generation;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -63,7 +60,6 @@ pub(crate) struct UploadQueueInitialized {
// Breakdown of different kinds of tasks currently in-progress
pub(crate) num_inprogress_layer_uploads: usize,
pub(crate) num_inprogress_metadata_uploads: usize,
pub(crate) num_inprogress_deletions: usize,
/// Tasks that are currently in-progress. In-progress means that a tokio Task
/// has been launched for it. An in-progress task can be busy uploading, but it can
@@ -121,7 +117,6 @@ impl UploadQueue {
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
};
@@ -163,7 +158,6 @@ impl UploadQueue {
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
};
@@ -201,14 +195,6 @@ pub(crate) struct UploadTask {
pub(crate) op: UploadOp,
}
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) file_kind: RemoteOpFileKind,
pub(crate) layer_file_name: LayerFileName,
pub(crate) scheduled_from_timeline_delete: bool,
pub(crate) generation: Generation,
}
#[derive(Debug)]
pub(crate) enum UploadOp {
/// Upload a layer file
@@ -217,9 +203,6 @@ pub(crate) enum UploadOp {
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete a layer file
Delete(Delete),
/// Barrier. When the barrier operation is reached,
Barrier(tokio::sync::watch::Sender<()>),
}
@@ -230,22 +213,14 @@ impl std::fmt::Display for UploadOp {
UploadOp::UploadLayer(path, metadata) => {
write!(
f,
"UploadLayer({}, size={:?}, gen={:?})",
"UploadLayer({}, size={:?})",
path.file_name(),
metadata.file_size(),
metadata.generation,
metadata.file_size()
)
}
UploadOp::UploadMetadata(_, lsn) => {
write!(f, "UploadMetadata(lsn: {})", lsn)
}
UploadOp::Delete(delete) => write!(
f,
"Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
delete.layer_file_name.file_name(),
delete.scheduled_from_timeline_delete,
delete.generation
),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}

1
pageserver/src/test.log Normal file
View File

@@ -0,0 +1 @@
-bash: scripts/pytest: No such file or directory

Some files were not shown because too many files have changed in this diff Show More