mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-25 22:30:38 +00:00
Compare commits
91 Commits
extension_
...
al/support
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
12f0c0ec8f | ||
|
|
13bd44a1f0 | ||
|
|
cda148d40d | ||
|
|
1aad8918e1 | ||
|
|
966213f429 | ||
|
|
35e73759f5 | ||
|
|
48936d44f8 | ||
|
|
2eae0a1fe5 | ||
|
|
53470ad12a | ||
|
|
edccef4514 | ||
|
|
982fce1e72 | ||
|
|
e767ced8d0 | ||
|
|
1309571f5d | ||
|
|
9a69b6cb94 | ||
|
|
cc82cd1b07 | ||
|
|
c76b74c50d | ||
|
|
ed938885ff | ||
|
|
db4d094afa | ||
|
|
0626e0bfd3 | ||
|
|
444d6e337f | ||
|
|
3a1be9b246 | ||
|
|
664d32eb7f | ||
|
|
ed845b644b | ||
|
|
87dd37a2f2 | ||
|
|
1355bd0ac5 | ||
|
|
a1d6b1a4af | ||
|
|
92aee7e07f | ||
|
|
5e2f29491f | ||
|
|
618d36ee6d | ||
|
|
33c2d94ba6 | ||
|
|
08bfe1c826 | ||
|
|
65ff256bb8 | ||
|
|
5177c1e4b1 | ||
|
|
49efcc3773 | ||
|
|
76b1cdc17e | ||
|
|
1f151d03d8 | ||
|
|
ac758e4f51 | ||
|
|
4f280c2953 | ||
|
|
20137d9588 | ||
|
|
634be4f4e0 | ||
|
|
d340cf3721 | ||
|
|
1741edf933 | ||
|
|
269e20aeab | ||
|
|
91435006bd | ||
|
|
b263510866 | ||
|
|
e418fc6dc3 | ||
|
|
434eaadbe3 | ||
|
|
6fb7edf494 | ||
|
|
505aa242ac | ||
|
|
1c516906e7 | ||
|
|
7d7cd8375c | ||
|
|
c92b7543b5 | ||
|
|
dbf88cf2d7 | ||
|
|
f1db87ac36 | ||
|
|
3f9defbfb4 | ||
|
|
c7143dbde6 | ||
|
|
cbf9a40889 | ||
|
|
10aba174c9 | ||
|
|
ab2ea8cfa5 | ||
|
|
9c8c55e819 | ||
|
|
10110bee69 | ||
|
|
cff7ae0b0d | ||
|
|
78a7f68902 | ||
|
|
24eaa3b7ca | ||
|
|
26828560a8 | ||
|
|
86604b3b7d | ||
|
|
4957bb2d48 | ||
|
|
ff1a1aea86 | ||
|
|
c9f05d418d | ||
|
|
9de1a6fb14 | ||
|
|
fbd37740c5 | ||
|
|
3e55d9dec6 | ||
|
|
f558f88a08 | ||
|
|
b990200496 | ||
|
|
7e20b49da4 | ||
|
|
032b603011 | ||
|
|
ca0e0781c8 | ||
|
|
b2a5e91a88 | ||
|
|
44e7d5132f | ||
|
|
c19681bc12 | ||
|
|
ec9b585837 | ||
|
|
02ef246db6 | ||
|
|
195d4932c6 | ||
|
|
7fe0a4bf1a | ||
|
|
ef2b9ffbcb | ||
|
|
250a27fb85 | ||
|
|
d748615c1f | ||
|
|
681c6910c2 | ||
|
|
148f0f9b21 | ||
|
|
a7f3f5f356 | ||
|
|
00d1cfa503 |
@@ -12,6 +12,11 @@ opt-level = 3
|
||||
# Turn on a small amount of optimization in Development mode.
|
||||
opt-level = 1
|
||||
|
||||
[build]
|
||||
# This is only present for local builds, as it will be overridden
|
||||
# by the RUSTDOCFLAGS env var in CI.
|
||||
rustdocflags = ["-Arustdoc::private_intra_doc_links"]
|
||||
|
||||
[alias]
|
||||
build_testing = ["build", "--features", "testing"]
|
||||
neon = ["run", "--bin", "neon_local"]
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
!trace/
|
||||
!vendor/postgres-v14/
|
||||
!vendor/postgres-v15/
|
||||
!vendor/postgres-v16/
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
!scripts/ninstall.sh
|
||||
|
||||
@@ -105,7 +105,7 @@ runs:
|
||||
# Get previously uploaded data for this run
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[].Key')
|
||||
S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key')
|
||||
if [ -z "$S3_FILEPATHS" ]; then
|
||||
# There's no previously uploaded data for this $GITHUB_RUN_ID
|
||||
exit 0
|
||||
|
||||
55
.github/workflows/approved-for-ci-run.yml
vendored
Normal file
55
.github/workflows/approved-for-ci-run.yml
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
name: Handle `approved-for-ci-run` label
|
||||
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
# Default types that triggers a workflow ([1]):
|
||||
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
|
||||
- opened
|
||||
- synchronize
|
||||
- reopened
|
||||
# Types that we wand to handle in addition to keep labels tidy:
|
||||
- closed
|
||||
# Actual magic happens here:
|
||||
- labeled
|
||||
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
jobs:
|
||||
remove-label:
|
||||
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
|
||||
# The PR should be reviewed and labelled manually again.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
create-branch:
|
||||
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
github.event.action == 'labeled' &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- run: gh pr checkout "${PR_NUMBER}"
|
||||
|
||||
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
|
||||
|
||||
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"
|
||||
149
.github/workflows/build_and_test.yml
vendored
149
.github/workflows/build_and_test.yml
vendored
@@ -5,6 +5,7 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release
|
||||
- ci-run/pr-*
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -127,6 +128,11 @@ jobs:
|
||||
- name: Run cargo clippy (release)
|
||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -155,7 +161,7 @@ jobs:
|
||||
build_type: [ debug, release ]
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build_type }}
|
||||
GIT_VERSION: ${{ github.sha }}
|
||||
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
@@ -174,6 +180,27 @@ jobs:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Check Postgres submodules revision
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
# This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
|
||||
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
|
||||
|
||||
FAILED=false
|
||||
for postgres in postgres-v14 postgres-v15; do
|
||||
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
|
||||
actual=$(git rev-parse "HEAD:vendor/${postgres}")
|
||||
if [ "${expected}" != "${actual}" ]; then
|
||||
echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
|
||||
FAILED=true
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "${FAILED}" = "true" ]; then
|
||||
echo >&2 "Please update vendors/revisions.json if these changes are intentional"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
@@ -614,7 +641,7 @@ jobs:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
@@ -658,7 +685,7 @@ jobs:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
@@ -715,13 +742,42 @@ jobs:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
--cleanup
|
||||
|
||||
# Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
|
||||
# During the transition period we need to have extensions in both places (in S3 and in compute-node image),
|
||||
# so we won't build extension twice, but extract them from compute-node.
|
||||
#
|
||||
# For now we use extensions image only for new custom extensitons
|
||||
- name: Kaniko build extensions only
|
||||
run: |
|
||||
# Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
|
||||
# Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
|
||||
# it still fails with error:
|
||||
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
|
||||
#
|
||||
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
|
||||
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
|
||||
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--context . \
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
|
||||
--build-arg PG_VERSION=${{ matrix.version }} \
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
|
||||
--dockerfile Dockerfile.compute-node \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
||||
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
||||
--cleanup \
|
||||
--target postgres-extensions
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
@@ -738,7 +794,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.8.0
|
||||
VM_BUILDER_VERSION: v0.13.1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -840,8 +896,10 @@ jobs:
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
- name: Push images to production ECR
|
||||
if: |
|
||||
@@ -852,8 +910,10 @@ jobs:
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest
|
||||
|
||||
- name: Configure Docker Hub login
|
||||
run: |
|
||||
@@ -875,16 +935,93 @@ jobs:
|
||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
upload-postgres-extensions-to-s3:
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
|
||||
needs: [ tag, promote-images ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15 ]
|
||||
|
||||
env:
|
||||
# While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
|
||||
# Later all the extensions will be moved to extensions image.
|
||||
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
|
||||
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
|
||||
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
|
||||
S3_BUCKETS: |
|
||||
${{ github.ref_name == 'release' &&
|
||||
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
|
||||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
|
||||
|
||||
steps:
|
||||
- name: Pull postgres-extensions image
|
||||
run: |
|
||||
docker pull ${EXTENSIONS_IMAGE}
|
||||
docker pull ${COMPUTE_NODE_IMAGE}
|
||||
|
||||
- name: Create postgres-extensions container
|
||||
id: create-container
|
||||
run: |
|
||||
EID=$(docker create ${EXTENSIONS_IMAGE} true)
|
||||
echo "EID=${EID}" >> $GITHUB_OUTPUT
|
||||
|
||||
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
|
||||
echo "CID=${CID}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Extract postgres-extensions from container
|
||||
run: |
|
||||
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
|
||||
|
||||
# In compute image we have a bit different directory layout
|
||||
mkdir -p extensions-to-upload/share
|
||||
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
|
||||
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
|
||||
|
||||
# Delete Neon extensitons (they always present on compute-node image)
|
||||
rm -rf ./extensions-to-upload/share/extension/neon*
|
||||
rm -rf ./extensions-to-upload/lib/neon*
|
||||
|
||||
# Delete leftovers from the extension build step
|
||||
rm -rf ./extensions-to-upload/lib/pgxs
|
||||
rm -rf ./extensions-to-upload/lib/pkgconfig
|
||||
|
||||
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
|
||||
for EXT_NAME in $(ls ./custom-extensions); do
|
||||
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
|
||||
|
||||
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
|
||||
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
|
||||
done
|
||||
|
||||
- name: Upload postgres-extensions to S3
|
||||
run: |
|
||||
for BUCKET in $(echo ${S3_BUCKETS}); do
|
||||
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
|
||||
done
|
||||
|
||||
- name: Cleanup
|
||||
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
|
||||
run: |
|
||||
docker rm ${{ steps.create-container.outputs.CID }} || true
|
||||
docker rm ${{ steps.create-container.outputs.EID }} || true
|
||||
|
||||
deploy:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
needs: [ promote-images, tag, regress-tests ]
|
||||
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
|
||||
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
|
||||
3
.github/workflows/neon_extra_builds.yml
vendored
3
.github/workflows/neon_extra_builds.yml
vendored
@@ -3,7 +3,8 @@ name: Check neon with extra platform builds
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- main
|
||||
- ci-run/pr-*
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
|
||||
4
.gitmodules
vendored
4
.gitmodules
vendored
@@ -6,3 +6,7 @@
|
||||
path = vendor/postgres-v15
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
branch = REL_15_STABLE_neon
|
||||
[submodule "vendor/postgres-v16"]
|
||||
path = vendor/postgres-v16
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
branch = REL_16_STABLE_neon
|
||||
|
||||
311
Cargo.lock
generated
311
Cargo.lock
generated
@@ -158,6 +158,19 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-compression"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
|
||||
dependencies = [
|
||||
"flate2",
|
||||
"futures-core",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-stream"
|
||||
version = "0.3.5"
|
||||
@@ -200,17 +213,6 @@ dependencies = [
|
||||
"critical-section",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
|
||||
dependencies = [
|
||||
"hermit-abi 0.1.19",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
@@ -604,7 +606,7 @@ dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide",
|
||||
"miniz_oxide 0.6.2",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
]
|
||||
@@ -805,18 +807,6 @@ dependencies = [
|
||||
"libloading",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "3.2.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"clap_lex 0.2.4",
|
||||
"indexmap",
|
||||
"textwrap",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.3.0"
|
||||
@@ -837,7 +827,7 @@ dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"bitflags",
|
||||
"clap_lex 0.5.0",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
]
|
||||
|
||||
@@ -853,15 +843,6 @@ dependencies = [
|
||||
"syn 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
|
||||
dependencies = [
|
||||
"os_str_bytes",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.5.0"
|
||||
@@ -914,9 +895,11 @@ name = "compute_tools"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"chrono",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"compute_api",
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper",
|
||||
"notify",
|
||||
@@ -924,14 +907,12 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"postgres",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tar",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -979,7 +960,7 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"git-version",
|
||||
@@ -999,7 +980,6 @@ dependencies = [
|
||||
"tar",
|
||||
"thiserror",
|
||||
"toml",
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -1050,19 +1030,19 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "criterion"
|
||||
version = "0.4.0"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb"
|
||||
checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
|
||||
dependencies = [
|
||||
"anes",
|
||||
"atty",
|
||||
"cast",
|
||||
"ciborium",
|
||||
"clap 3.2.25",
|
||||
"clap",
|
||||
"criterion-plot",
|
||||
"is-terminal",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"oorandom",
|
||||
"plotters",
|
||||
"rayon",
|
||||
@@ -1143,7 +1123,7 @@ dependencies = [
|
||||
"crossterm_winapi",
|
||||
"libc",
|
||||
"mio",
|
||||
"parking_lot",
|
||||
"parking_lot 0.12.1",
|
||||
"signal-hook",
|
||||
"signal-hook-mio",
|
||||
"winapi",
|
||||
@@ -1213,7 +1193,7 @@ dependencies = [
|
||||
"hashbrown 0.12.3",
|
||||
"lock_api",
|
||||
"once_cell",
|
||||
"parking_lot_core",
|
||||
"parking_lot_core 0.9.7",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1402,6 +1382,16 @@ version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
@@ -1679,15 +1669,6 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.2.6"
|
||||
@@ -1942,6 +1923,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2192,6 +2176,15 @@ dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
|
||||
dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "0.8.6"
|
||||
@@ -2270,16 +2263,6 @@ dependencies = [
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||
dependencies = [
|
||||
"overload",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.3"
|
||||
@@ -2396,9 +2379,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
|
||||
checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
|
||||
dependencies = [
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
@@ -2406,9 +2389,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-http"
|
||||
version = "0.7.0"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
|
||||
checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -2419,9 +2402,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.11.0"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
|
||||
checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures",
|
||||
@@ -2437,48 +2420,47 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
|
||||
checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"futures-util",
|
||||
"opentelemetry",
|
||||
"prost",
|
||||
"tonic 0.8.3",
|
||||
"tonic-build 0.8.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-semantic-conventions"
|
||||
version = "0.10.0"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
|
||||
checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_api"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
|
||||
checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"indexmap",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
|
||||
checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"crossbeam-channel",
|
||||
@@ -2507,31 +2489,19 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "os_str_bytes"
|
||||
version = "6.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
|
||||
|
||||
[[package]]
|
||||
name = "outref"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "pagectl"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"git-version",
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
@@ -2545,12 +2515,13 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"close_fds",
|
||||
"const_format",
|
||||
"consumption_metrics",
|
||||
@@ -2561,6 +2532,7 @@ dependencies = [
|
||||
"enum-map",
|
||||
"enumset",
|
||||
"fail",
|
||||
"flate2",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
@@ -2632,6 +2604,17 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
|
||||
dependencies = [
|
||||
"instant",
|
||||
"lock_api",
|
||||
"parking_lot_core 0.8.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.1"
|
||||
@@ -2639,7 +2622,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core",
|
||||
"parking_lot_core 0.9.7",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"instant",
|
||||
"libc",
|
||||
"redox_syscall 0.2.16",
|
||||
"smallvec",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2655,6 +2652,16 @@ dependencies = [
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pbkdf2"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
@@ -2929,9 +2936,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.58"
|
||||
version = "1.0.64"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
|
||||
checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@@ -2960,7 +2967,7 @@ dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"memchr",
|
||||
"parking_lot",
|
||||
"parking_lot 0.12.1",
|
||||
"procfs",
|
||||
"thiserror",
|
||||
]
|
||||
@@ -3025,12 +3032,11 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"atty",
|
||||
"base64 0.13.1",
|
||||
"bstr",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"consumption_metrics",
|
||||
"futures",
|
||||
"git-version",
|
||||
@@ -3048,7 +3054,8 @@ dependencies = [
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"parking_lot",
|
||||
"parking_lot 0.12.1",
|
||||
"pbkdf2",
|
||||
"pin-project-lite",
|
||||
"postgres-native-tls",
|
||||
"postgres_backend",
|
||||
@@ -3059,6 +3066,7 @@ dependencies = [
|
||||
"regex",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"reqwest-retry",
|
||||
"reqwest-tracing",
|
||||
"routerify",
|
||||
"rstest",
|
||||
@@ -3295,10 +3303,33 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.4.4"
|
||||
name = "reqwest-retry"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
|
||||
checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"chrono",
|
||||
"futures",
|
||||
"getrandom",
|
||||
"http",
|
||||
"hyper",
|
||||
"parking_lot 0.11.2",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"retry-policies",
|
||||
"task-local-extensions",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"wasm-timer",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -3312,6 +3343,17 @@ dependencies = [
|
||||
"tracing-opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "retry-policies"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"rand",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.16.20"
|
||||
@@ -3510,7 +3552,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"fs2",
|
||||
@@ -3521,7 +3563,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot",
|
||||
"parking_lot 0.12.1",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
@@ -3940,7 +3982,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"bytes",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"const_format",
|
||||
"futures",
|
||||
"futures-core",
|
||||
@@ -3950,12 +3992,12 @@ dependencies = [
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot",
|
||||
"parking_lot 0.12.1",
|
||||
"prost",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tonic 0.9.2",
|
||||
"tonic-build 0.9.2",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -4121,12 +4163,6 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textwrap"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.40"
|
||||
@@ -4284,7 +4320,7 @@ dependencies = [
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"log",
|
||||
"parking_lot",
|
||||
"parking_lot 0.12.1",
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
@@ -4479,19 +4515,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
|
||||
dependencies = [
|
||||
"prettyplease 0.1.25",
|
||||
"proc-macro2",
|
||||
"prost-build",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.9.2"
|
||||
@@ -4542,7 +4565,7 @@ name = "trace"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"pageserver_api",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -4615,9 +4638,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
|
||||
checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
@@ -4644,7 +4667,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -4813,11 +4835,11 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"atty",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"const_format",
|
||||
"criterion",
|
||||
"futures",
|
||||
"heapless",
|
||||
@@ -4890,7 +4912,7 @@ name = "wal_craft"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"env_logger",
|
||||
"log",
|
||||
"once_cell",
|
||||
@@ -4994,6 +5016,21 @@ version = "0.2.86"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-timer"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"js-sys",
|
||||
"parking_lot 0.11.2",
|
||||
"pin-utils",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.63"
|
||||
@@ -5255,7 +5292,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.3.0",
|
||||
"clap",
|
||||
"clap_builder",
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
|
||||
19
Cargo.toml
19
Cargo.toml
@@ -32,9 +32,10 @@ license = "Apache-2.0"
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
atty = "0.2.14"
|
||||
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "0.27"
|
||||
aws-smithy-http = "0.55"
|
||||
@@ -83,18 +84,20 @@ notify = "5.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.18.0"
|
||||
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.10.0"
|
||||
opentelemetry = "0.19.0"
|
||||
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.11.0"
|
||||
parking_lot = "0.12"
|
||||
pbkdf2 = "0.12.1"
|
||||
pin-project-lite = "0.2"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
reqwest-retry = "0.2.2"
|
||||
routerify = "3"
|
||||
rpds = "0.13"
|
||||
rustls = "0.20"
|
||||
@@ -127,8 +130,8 @@ toml_edit = "0.19"
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.18.0"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
tracing-opentelemetry = "0.19.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
@@ -170,7 +173,7 @@ utils = { version = "0.1", path = "./libs/utils/" }
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
criterion = "0.4"
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.10"
|
||||
rstest = "0.17"
|
||||
tempfile = "3.4"
|
||||
|
||||
@@ -12,6 +12,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
|
||||
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
|
||||
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
|
||||
COPY --chown=nonroot pgxn pgxn
|
||||
COPY --chown=nonroot Makefile Makefile
|
||||
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
|
||||
@@ -39,6 +40,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||
COPY --chown=nonroot . .
|
||||
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
@@ -79,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
|
||||
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||
|
||||
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
||||
|
||||
@@ -132,10 +132,20 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.ta
|
||||
FROM build-deps AS h3-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# packaged cmake is too old
|
||||
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
|
||||
RUN case "$(uname -m)" in \
|
||||
"x86_64") \
|
||||
export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
|
||||
;; \
|
||||
"aarch64") \
|
||||
export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
|
||||
;; \
|
||||
*) \
|
||||
echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
|
||||
-q -O /tmp/cmake-install.sh \
|
||||
&& echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \
|
||||
&& echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
|
||||
&& chmod u+x /tmp/cmake-install.sh \
|
||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||
&& rm /tmp/cmake-install.sh
|
||||
@@ -189,8 +199,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
|
||||
echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
@@ -498,6 +508,62 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-roaringbitmap-pg-build"
|
||||
# compile pg_roaringbitmap extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-roaringbitmap-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
|
||||
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-embedding-pg-build"
|
||||
# compile pg_embedding extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
|
||||
# There is no release tag yet
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-anon-pg-build"
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sort > /before.txt && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
find /usr/local/pgsql -type f | sort > /after.txt && \
|
||||
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
@@ -606,6 +672,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
# Public extensions
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=postgis-build /sfcgal/* /
|
||||
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -632,6 +699,8 @@ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -680,6 +749,22 @@ RUN rm -r /usr/local/pgsql/include
|
||||
# if they were to be used by other libraries.
|
||||
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Extenstion only
|
||||
#
|
||||
#########################################################################################
|
||||
FROM scratch AS postgres-extensions
|
||||
# After the transition this layer will include all extensitons.
|
||||
# As for now, it's only for new custom ones
|
||||
#
|
||||
# # Default extensions
|
||||
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
|
||||
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
|
||||
# Custom extensions
|
||||
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
|
||||
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
|
||||
17
Makefile
17
Makefile
@@ -83,6 +83,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
||||
# I'm not sure why it wouldn't work, but this is the only place (apart from
|
||||
# the "build-all-versions" entry points) where direct mention of PostgreSQL
|
||||
# versions is used.
|
||||
.PHONY: postgres-configure-v16
|
||||
postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
|
||||
.PHONY: postgres-configure-v15
|
||||
postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
|
||||
.PHONY: postgres-configure-v14
|
||||
@@ -165,28 +167,33 @@ neon-pg-ext-clean-%:
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
neon-pg-ext-v14 \
|
||||
neon-pg-ext-v15
|
||||
neon-pg-ext-v15 \
|
||||
neon-pg-ext-v16
|
||||
|
||||
.PHONY: neon-pg-ext-clean
|
||||
neon-pg-ext-clean: \
|
||||
neon-pg-ext-clean-v14 \
|
||||
neon-pg-ext-clean-v15
|
||||
neon-pg-ext-clean-v15 \
|
||||
neon-pg-ext-clean-v16
|
||||
|
||||
# shorthand to build all Postgres versions
|
||||
.PHONY: postgres
|
||||
postgres: \
|
||||
postgres-v14 \
|
||||
postgres-v15
|
||||
postgres-v15 \
|
||||
postgres-v16
|
||||
|
||||
.PHONY: postgres-headers
|
||||
postgres-headers: \
|
||||
postgres-headers-v14 \
|
||||
postgres-headers-v15
|
||||
postgres-headers-v15 \
|
||||
postgres-headers-v16
|
||||
|
||||
.PHONY: postgres-clean
|
||||
postgres-clean: \
|
||||
postgres-clean-v14 \
|
||||
postgres-clean-v15
|
||||
postgres-clean-v15 \
|
||||
postgres-clean-v16
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
|
||||
16
README.md
16
README.md
@@ -132,13 +132,13 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
|
||||
# Create repository in .neon with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
> cargo neon init
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'.
|
||||
Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon"
|
||||
|
||||
# start pageserver, safekeeper, and broker for their intercommunication
|
||||
> cargo neon start
|
||||
Starting neon broker at 127.0.0.1:50051
|
||||
Starting neon broker at 127.0.0.1:50051.
|
||||
storage_broker started, pid: 2918372
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'.
|
||||
Starting pageserver node 1 at '127.0.0.1:64000' in ".neon".
|
||||
pageserver started, pid: 2918386
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
|
||||
safekeeper 1 started, pid: 2918437
|
||||
@@ -152,8 +152,7 @@ Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
|
||||
# start postgres compute node
|
||||
> cargo neon endpoint start main
|
||||
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
||||
Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
|
||||
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
|
||||
|
||||
# check list of running postgres instances
|
||||
> cargo neon endpoint list
|
||||
@@ -189,18 +188,17 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
|
||||
# start postgres on that branch
|
||||
> cargo neon endpoint start migration_check --branch-name migration_check
|
||||
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
||||
Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
|
||||
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
|
||||
|
||||
# check the new list of running postgres instances
|
||||
> cargo neon endpoint list
|
||||
ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
|
||||
migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
|
||||
migration_check 127.0.0.1:55434 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
|
||||
|
||||
# this new postgres instance will have all the data from 'main' postgres,
|
||||
# but all modifications would not affect data in original postgres
|
||||
> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres
|
||||
> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
|
||||
@@ -6,8 +6,10 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
notify.workspace = true
|
||||
@@ -30,5 +32,3 @@ url.workspace = true
|
||||
compute_api.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
|
||||
@@ -27,8 +27,7 @@
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r {"bucket": "my-bucket", "region": "eu-central-1"}
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -36,7 +35,7 @@ use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -49,7 +48,6 @@ use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
@@ -66,20 +64,6 @@ fn main() -> Result<()> {
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
let matches = cli().get_matches();
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
|
||||
let ext_remote_storage = match remote_ext_config {
|
||||
Some(x) => match init_remote_storage(x) {
|
||||
Ok(y) => Some(y),
|
||||
Err(e) => {
|
||||
dbg!("Error {:?}, setting remote storage to None", e);
|
||||
None
|
||||
}
|
||||
},
|
||||
None => None,
|
||||
};
|
||||
|
||||
let http_port = *matches
|
||||
.get_one::<u16>("http-port")
|
||||
@@ -144,6 +128,9 @@ fn main() -> Result<()> {
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
||||
|
||||
let spec;
|
||||
let mut live_config_allowed = false;
|
||||
match spec_json {
|
||||
@@ -181,7 +168,6 @@ fn main() -> Result<()> {
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
let spec_set;
|
||||
|
||||
if let Some(spec) = spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
@@ -193,13 +179,9 @@ fn main() -> Result<()> {
|
||||
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
pgversion: get_pg_version(pgbin),
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage,
|
||||
available_libraries: OnceLock::new(),
|
||||
available_extensions: OnceLock::new(),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -208,8 +190,6 @@ fn main() -> Result<()> {
|
||||
let _http_handle =
|
||||
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
@@ -250,7 +230,7 @@ fn main() -> Result<()> {
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
let mut exit_code = None;
|
||||
let pg = match compute.start_compute(extension_server_port) {
|
||||
let pg = match compute.start_compute() {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:?}", err);
|
||||
@@ -276,6 +256,16 @@ fn main() -> Result<()> {
|
||||
exit_code = ecode.code()
|
||||
}
|
||||
|
||||
// Maybe sync safekeepers again, to speed up next startup
|
||||
let compute_state = compute.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
|
||||
info!("syncing safekeepers on shutdown");
|
||||
let storage_auth_token = pspec.storage_auth_token.clone();
|
||||
let lsn = compute.sync_safekeepers(storage_auth_token)?;
|
||||
info!("synced safekeepers at lsn {lsn}");
|
||||
}
|
||||
|
||||
if let Err(err) = compute.check_for_core_dumps() {
|
||||
error!("error while checking for core dumps: {err:?}");
|
||||
}
|
||||
@@ -369,12 +359,6 @@ fn cli() -> clap::Command {
|
||||
.long("control-plane-uri")
|
||||
.value_name("CONTROL_PLANE_API_BASE_URI"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("remote-ext-config")
|
||||
.short('r')
|
||||
.long("remote-ext-config")
|
||||
.value_name("REMOTE_EXT_CONFIG"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Condvar, Mutex, OnceLock};
|
||||
use std::sync::{Condvar, Mutex};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -17,12 +16,11 @@ use utils::lsn::Lsn;
|
||||
|
||||
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
|
||||
use crate::config;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::{config, extension_server};
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
@@ -30,7 +28,6 @@ pub struct ComputeNode {
|
||||
pub connstr: url::Url,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub pgversion: String,
|
||||
/// We should only allow live re- / configuration of the compute node if
|
||||
/// it uses 'pull model', i.e. it can go to control-plane and fetch
|
||||
/// the latest configuration. Otherwise, there could be a case:
|
||||
@@ -50,10 +47,6 @@ pub struct ComputeNode {
|
||||
pub state: Mutex<ComputeState>,
|
||||
/// `Condvar` to allow notifying waiters about state changes.
|
||||
pub state_changed: Condvar,
|
||||
/// S3 extensions configuration variables
|
||||
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||
pub available_libraries: OnceLock<HashMap<String, RemotePath>>,
|
||||
pub available_extensions: OnceLock<HashMap<String, Vec<RemotePath>>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -149,14 +142,14 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| format!("'{}'", escape_literal(&r.name)))
|
||||
.map(|r| escape_literal(&r.name))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let dbs = spec
|
||||
.cluster
|
||||
.databases
|
||||
.iter()
|
||||
.map(|db| format!("'{}'", escape_literal(&db.name)))
|
||||
.map(|db| escape_literal(&db.name))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let roles_decl = if roles.is_empty() {
|
||||
@@ -244,7 +237,7 @@ impl ComputeNode {
|
||||
|
||||
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
#[instrument(skip(self, compute_state))]
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let start_time = Utc::now();
|
||||
@@ -262,20 +255,52 @@ impl ComputeNode {
|
||||
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
|
||||
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it
|
||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
|
||||
_ => format!(
|
||||
"basebackup {} {} {} --gzip",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
),
|
||||
};
|
||||
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut measured_reader = MeasuredReader::new(copyreader);
|
||||
|
||||
// Check the magic number to see if it's a gzip or not. Even though
|
||||
// we might explicitly ask for gzip, an old pageserver with no implementation
|
||||
// of gzip compression might send us uncompressed data. After some time
|
||||
// passes we can assume all pageservers know how to compress and we can
|
||||
// delete this check.
|
||||
//
|
||||
// If the data is not gzip, it will be tar. It will not be mistakenly
|
||||
// recognized as gzip because tar starts with an ascii encoding of a filename,
|
||||
// and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
|
||||
// we send the "global" directory first from the pageserver, so it definitely
|
||||
// won't be recognized as gzip.
|
||||
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
|
||||
let gzip = {
|
||||
let peek = bufreader.fill_buf().unwrap();
|
||||
peek[0] == 0x1f && peek[1] == 0x8b
|
||||
};
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
if gzip {
|
||||
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
} else {
|
||||
let mut ar = tar::Archive::new(&mut bufreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
};
|
||||
|
||||
// Report metrics
|
||||
self.state.lock().unwrap().metrics.basebackup_bytes =
|
||||
measured_reader.get_byte_count() as u64;
|
||||
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
@@ -286,8 +311,8 @@ impl ComputeNode {
|
||||
|
||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
// and return the reported LSN back to the caller.
|
||||
#[instrument(skip(self, storage_auth_token))]
|
||||
fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
|
||||
#[instrument(skip_all)]
|
||||
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let sync_handle = Command::new(&self.pgbin)
|
||||
@@ -331,23 +356,15 @@ impl ComputeNode {
|
||||
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
#[instrument(skip(self, compute_state))]
|
||||
pub fn prepare_pgdata(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
extension_server_port: u16,
|
||||
) -> Result<()> {
|
||||
#[instrument(skip_all)]
|
||||
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(
|
||||
&pgdata_path.join("postgresql.conf"),
|
||||
&pspec.spec,
|
||||
Some(extension_server_port),
|
||||
)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
// is already connected it will be kicked out, so a secondary (standby)
|
||||
@@ -397,7 +414,7 @@ impl ComputeNode {
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
#[instrument(skip(self))]
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_postgres(
|
||||
&self,
|
||||
storage_auth_token: Option<String>,
|
||||
@@ -421,7 +438,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
/// Do initial configuration of the already started Postgres.
|
||||
#[instrument(skip(self, compute_state))]
|
||||
#[instrument(skip_all)]
|
||||
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
// If connection fails,
|
||||
// it may be the old node with `zenith_admin` superuser.
|
||||
@@ -475,7 +492,7 @@ impl ComputeNode {
|
||||
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
|
||||
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
|
||||
// have opened connection to Postgres and superuser access.
|
||||
#[instrument(skip(self, client))]
|
||||
#[instrument(skip_all)]
|
||||
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
|
||||
client.simple_query("SELECT pg_reload_conf()")?;
|
||||
Ok(())
|
||||
@@ -483,13 +500,13 @@ impl ComputeNode {
|
||||
|
||||
/// Similar to `apply_config()`, but does a bit different sequence of operations,
|
||||
/// as it's used to reconfigure a previously started and configured Postgres node.
|
||||
#[instrument(skip(self))]
|
||||
#[instrument(skip_all)]
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
|
||||
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
self.pg_reload_conf(&mut client)?;
|
||||
@@ -518,8 +535,8 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_compute(&self) -> Result<std::process::Child> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
@@ -530,14 +547,12 @@ impl ComputeNode {
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
self.prepare_external_extensions(&compute_state)?;
|
||||
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
self.prepare_pgdata(&compute_state)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
|
||||
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
|
||||
let config_time = Utc::now();
|
||||
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
||||
self.apply_config(&compute_state)?;
|
||||
}
|
||||
@@ -545,11 +560,16 @@ impl ComputeNode {
|
||||
let startup_end_time = Utc::now();
|
||||
{
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.config_ms = startup_end_time
|
||||
state.metrics.start_postgres_ms = config_time
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
state.metrics.config_ms = startup_end_time
|
||||
.signed_duration_since(config_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
state.metrics.total_startup_ms = startup_end_time
|
||||
.signed_duration_since(compute_state.start_time)
|
||||
.to_std()
|
||||
@@ -563,6 +583,13 @@ impl ComputeNode {
|
||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
||||
);
|
||||
|
||||
// Log metrics so that we can search for slow operations in logs
|
||||
let metrics = {
|
||||
let state = self.state.lock().unwrap();
|
||||
state.metrics.clone()
|
||||
};
|
||||
info!(?metrics, "compute start finished");
|
||||
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
@@ -668,131 +695,4 @@ LIMIT 100",
|
||||
"{{\"pg_stat_statements\": []}}".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
// If remote extension storage is configured,
|
||||
// download extension control files
|
||||
// and shared preload libraries.
|
||||
#[tokio::main]
|
||||
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
// download preload shared libraries before postgres start (if any)
|
||||
let spec = &pspec.spec;
|
||||
|
||||
// 1. parse private extension paths from spec
|
||||
let private_ext_prefixes = match &spec.private_extensions {
|
||||
Some(private_extensions) => private_extensions.clone(),
|
||||
None => Vec::new(),
|
||||
};
|
||||
|
||||
info!("private_ext_prefixes: {:?}", &private_ext_prefixes);
|
||||
|
||||
// 2. parse shared_preload_libraries from spec
|
||||
let mut libs_vec = Vec::new();
|
||||
|
||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||
libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
|
||||
info!(
|
||||
"shared_preload_libraries parsed from spec.cluster.settings: {:?}",
|
||||
libs_vec
|
||||
);
|
||||
|
||||
// also parse shared_preload_libraries from provided postgresql.conf
|
||||
// that is used in neon_local and python tests
|
||||
if let Some(conf) = &spec.cluster.postgresql_conf {
|
||||
let conf_lines = conf.split('\n').collect::<Vec<&str>>();
|
||||
|
||||
let mut shared_preload_libraries_line = "";
|
||||
for line in conf_lines {
|
||||
if line.starts_with("shared_preload_libraries") {
|
||||
shared_preload_libraries_line = line;
|
||||
}
|
||||
}
|
||||
|
||||
let mut preload_libs_vec = Vec::new();
|
||||
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
|
||||
preload_libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
|
||||
info!(
|
||||
"shared_preload_libraries parsed from spec.cluster.postgresql_conf: {:?}",
|
||||
preload_libs_vec
|
||||
);
|
||||
|
||||
libs_vec.extend(preload_libs_vec);
|
||||
}
|
||||
|
||||
// download extension control files & shared_preload_libraries
|
||||
|
||||
let available_extensions = extension_server::get_available_extensions(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&private_ext_prefixes,
|
||||
)
|
||||
.await?;
|
||||
self.available_extensions
|
||||
.set(available_extensions)
|
||||
.expect("available_extensions.set error");
|
||||
|
||||
info!("Libraries to download: {:?}", &libs_vec);
|
||||
let available_libraries = extension_server::get_available_libraries(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&private_ext_prefixes,
|
||||
&libs_vec,
|
||||
)
|
||||
.await?;
|
||||
self.available_libraries
|
||||
.set(available_libraries)
|
||||
.expect("available_libraries.set error");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn download_extension_sql_files(&self, filename: String) -> Result<()> {
|
||||
match &self.ext_remote_storage {
|
||||
None => anyhow::bail!("No remote extension storage"),
|
||||
Some(remote_storage) => {
|
||||
extension_server::download_extension_sql_files(
|
||||
&filename,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
self.available_extensions
|
||||
.get()
|
||||
.context("available_extensions broke")?,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn download_library_file(&self, filename: String) -> Result<()> {
|
||||
match &self.ext_remote_storage {
|
||||
None => anyhow::bail!("No remote extension storage"),
|
||||
Some(remote_storage) => {
|
||||
extension_server::download_library_file(
|
||||
&filename,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
self.available_libraries
|
||||
.get()
|
||||
.context("available_libraries broke")?,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,11 +33,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
pub fn write_postgres_conf(
|
||||
path: &Path,
|
||||
spec: &ComputeSpec,
|
||||
extension_server_port: Option<u16>,
|
||||
) -> Result<()> {
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
// File::create() destroys the file content if it exists.
|
||||
let mut file = File::create(path)?;
|
||||
|
||||
@@ -51,30 +47,22 @@ pub fn write_postgres_conf(
|
||||
// Add options for connecting to storage
|
||||
writeln!(file, "# Neon storage settings")?;
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.pageserver_connstring='{}'",
|
||||
escape_conf_value(s)
|
||||
)?;
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
}
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.safekeepers='{}'",
|
||||
"neon.safekeepers={}",
|
||||
escape_conf_value(&spec.safekeeper_connstrings.join(","))
|
||||
)?;
|
||||
}
|
||||
if let Some(s) = &spec.tenant_id {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.tenant_id='{}'",
|
||||
escape_conf_value(&s.to_string())
|
||||
)?;
|
||||
writeln!(file, "neon.tenant_id={}", escape_conf_value(&s.to_string()))?;
|
||||
}
|
||||
if let Some(s) = &spec.timeline_id {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.timeline_id='{}'",
|
||||
"neon.timeline_id={}",
|
||||
escape_conf_value(&s.to_string())
|
||||
)?;
|
||||
}
|
||||
@@ -99,9 +87,5 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
}
|
||||
|
||||
if let Some(port) = extension_server_port {
|
||||
writeln!(file, "neon.extension_server_port={}", port)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
#[instrument(skip(compute))]
|
||||
#[instrument(skip_all)]
|
||||
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
info!("waiting for reconfiguration requests");
|
||||
loop {
|
||||
|
||||
@@ -1,379 +0,0 @@
|
||||
// Download extension files from the extension store
|
||||
// and put them in the right place in the postgres directory
|
||||
use anyhow::{self, bail, Context, Result};
|
||||
use remote_storage::*;
|
||||
use serde_json::{self, Value};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::info;
|
||||
|
||||
const SHARE_EXT_PATH: &str = "share/postgresql/extension";
|
||||
|
||||
fn get_pg_config(argument: &str, pgbin: &str) -> String {
|
||||
// gives the result of `pg_config [argument]`
|
||||
// where argument is a flag like `--version` or `--sharedir`
|
||||
let pgconfig = pgbin.replace("postgres", "pg_config");
|
||||
let config_output = std::process::Command::new(pgconfig)
|
||||
.arg(argument)
|
||||
.output()
|
||||
.expect("pg_config error");
|
||||
std::str::from_utf8(&config_output.stdout)
|
||||
.expect("pg_config error")
|
||||
.trim()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
pub fn get_pg_version(pgbin: &str) -> String {
|
||||
// pg_config --version returns a (platform specific) human readable string
|
||||
// such as "PostgreSQL 15.4". We parse this to v14/v15
|
||||
let human_version = get_pg_config("--version", pgbin);
|
||||
if human_version.contains("15") {
|
||||
return "v15".to_string();
|
||||
} else if human_version.contains("14") {
|
||||
return "v14".to_string();
|
||||
}
|
||||
panic!("Unsuported postgres version {human_version}");
|
||||
}
|
||||
|
||||
async fn download_helper(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
remote_from_path: &RemotePath,
|
||||
remote_from_prefix: Option<&Path>,
|
||||
download_location: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
// downloads file at remote_from_path to download_location/[file_name]
|
||||
|
||||
// we cannot use remote_from_path.object_name() here
|
||||
// because extension files can be in subdirectories of the extension store.
|
||||
//
|
||||
// To handle this, we use remote_from_prefix to strip the prefix from the path
|
||||
// this gives us the relative path of the file in the extension store,
|
||||
// and we use this relative path to construct the local path.
|
||||
//
|
||||
let local_path = match remote_from_prefix {
|
||||
Some(prefix) => {
|
||||
let p = remote_from_path
|
||||
.get_path()
|
||||
.strip_prefix(prefix)
|
||||
.expect("bad prefix");
|
||||
|
||||
download_location.join(p)
|
||||
}
|
||||
None => download_location.join(remote_from_path.object_name().expect("bad object")),
|
||||
};
|
||||
|
||||
if local_path.exists() {
|
||||
info!("File {:?} already exists. Skipping download", &local_path);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!(
|
||||
"Downloading {:?} to location {:?}",
|
||||
&remote_from_path, &local_path
|
||||
);
|
||||
let mut download = remote_storage.download(remote_from_path).await?;
|
||||
let mut write_data_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut write_data_buffer)
|
||||
.await?;
|
||||
if remote_from_prefix.is_some() {
|
||||
if let Some(prefix) = local_path.parent() {
|
||||
info!(
|
||||
"Downloading file with prefix. Create directory {:?}",
|
||||
prefix
|
||||
);
|
||||
// if directory already exists, this is a no-op
|
||||
std::fs::create_dir_all(prefix)?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_file = BufWriter::new(File::create(local_path)?);
|
||||
output_file.write_all(&write_data_buffer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// download extension control files
|
||||
//
|
||||
// if private_ext_prefixes is provided - search also in private extension paths
|
||||
//
|
||||
pub async fn get_available_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
private_ext_prefixes: &Vec<String>,
|
||||
) -> anyhow::Result<HashMap<String, Vec<RemotePath>>> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
|
||||
let mut paths: Vec<RemotePath> = Vec::new();
|
||||
// public extensions
|
||||
paths.push(RemotePath::new(
|
||||
&Path::new(pg_version).join(SHARE_EXT_PATH),
|
||||
)?);
|
||||
// private extensions
|
||||
for private_prefix in private_ext_prefixes {
|
||||
paths.push(RemotePath::new(
|
||||
&Path::new(pg_version)
|
||||
.join(private_prefix)
|
||||
.join(SHARE_EXT_PATH),
|
||||
)?);
|
||||
}
|
||||
|
||||
let all_available_files = list_files_in_prefixes_for_extensions(remote_storage, &paths).await?;
|
||||
|
||||
info!(
|
||||
"list of available_extension files {:?}",
|
||||
&all_available_files
|
||||
);
|
||||
|
||||
// download all control files
|
||||
for (obj_name, obj_paths) in &all_available_files {
|
||||
for obj_path in obj_paths {
|
||||
if obj_name.ends_with("control") {
|
||||
download_helper(remote_storage, obj_path, None, &local_sharedir).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(all_available_files)
|
||||
}
|
||||
|
||||
// Download requested shared_preload_libraries
|
||||
//
|
||||
// Note that tenant_id is not optional here, because we only download libraries
|
||||
// after we know the tenant spec and the tenant_id.
|
||||
//
|
||||
// return list of all library files to use it in the future searches
|
||||
pub async fn get_available_libraries(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
private_ext_prefixes: &Vec<String>,
|
||||
preload_libraries: &Vec<String>,
|
||||
) -> anyhow::Result<HashMap<String, RemotePath>> {
|
||||
let local_libdir: PathBuf = Path::new(&get_pg_config("--pkglibdir", pgbin)).into();
|
||||
// Construct a hashmap of all available libraries
|
||||
// example (key, value) pair: test_lib0.so, v14/lib/test_lib0.so
|
||||
|
||||
let mut paths: Vec<RemotePath> = Vec::new();
|
||||
// public libraries
|
||||
paths.push(
|
||||
RemotePath::new(&Path::new(&pg_version).join("lib/"))
|
||||
.expect("The hard coded path here is valid"),
|
||||
);
|
||||
// private libraries
|
||||
for private_prefix in private_ext_prefixes {
|
||||
paths.push(
|
||||
RemotePath::new(&Path::new(&pg_version).join(private_prefix).join("lib"))
|
||||
.expect("The hard coded path here is valid"),
|
||||
);
|
||||
}
|
||||
|
||||
let all_available_libraries = list_files_in_prefixes(remote_storage, &paths).await?;
|
||||
|
||||
info!("list of library files {:?}", &all_available_libraries);
|
||||
|
||||
// download all requested libraries
|
||||
for lib_name in preload_libraries {
|
||||
// add file extension if it isn't in the filename
|
||||
let lib_name_with_ext = enforce_so_end(lib_name);
|
||||
info!("looking for library {:?}", &lib_name_with_ext);
|
||||
|
||||
match all_available_libraries.get(&*lib_name_with_ext) {
|
||||
Some(remote_path) => {
|
||||
download_helper(remote_storage, remote_path, None, &local_libdir).await?
|
||||
}
|
||||
None => {
|
||||
let file_path = local_libdir.join(&lib_name_with_ext);
|
||||
if file_path.exists() {
|
||||
info!("File {:?} already exists. Skipping download", &file_path);
|
||||
} else {
|
||||
bail!("Shared library file {lib_name} is not found in the extension store")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(all_available_libraries)
|
||||
}
|
||||
|
||||
// download all sqlfiles (and possibly data files) for a given extension name
|
||||
//
|
||||
pub async fn download_extension_sql_files(
|
||||
ext_name: &str,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
all_available_files: &HashMap<String, Vec<RemotePath>>,
|
||||
) -> Result<()> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let mut downloaded_something = false;
|
||||
|
||||
if let Some(files) = all_available_files.get(ext_name) {
|
||||
for file in files {
|
||||
if file.extension().context("bad file name")? != "control" {
|
||||
// find files prefix to handle cases when extension files are stored
|
||||
// in a directory with the same name as the extension
|
||||
// example:
|
||||
// share/postgresql/extension/extension_name/extension_name--1.0.sql
|
||||
let index = file
|
||||
.get_path()
|
||||
.to_str()
|
||||
.context("invalid path")?
|
||||
.find(ext_name)
|
||||
.context("invalid path")?;
|
||||
|
||||
let prefix_str =
|
||||
file.get_path().to_str().context("invalid path")?[..index].to_string();
|
||||
let remote_from_prefix = if prefix_str.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Path::new(&prefix_str))
|
||||
};
|
||||
|
||||
download_helper(remote_storage, file, remote_from_prefix, &local_sharedir).await?;
|
||||
downloaded_something = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !downloaded_something {
|
||||
bail!("Files for extension {ext_name} are not found in the extension store");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// appends an .so suffix to libname if it does not already have one
|
||||
fn enforce_so_end(libname: &str) -> String {
|
||||
if !libname.ends_with(".so") {
|
||||
format!("{}.so", libname)
|
||||
} else {
|
||||
libname.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
// download shared library file
|
||||
pub async fn download_library_file(
|
||||
lib_name: &str,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
all_available_libraries: &HashMap<String, RemotePath>,
|
||||
) -> Result<()> {
|
||||
let local_libdir: PathBuf = Path::new(&get_pg_config("--pkglibdir", pgbin)).into();
|
||||
let lib_name_with_ext = enforce_so_end(lib_name);
|
||||
info!("looking for library {:?}", &lib_name_with_ext);
|
||||
match all_available_libraries.get(&*lib_name_with_ext) {
|
||||
Some(remote_path) => {
|
||||
download_helper(remote_storage, remote_path, None, &local_libdir).await?
|
||||
}
|
||||
None => bail!("Shared library file {lib_name} is not found in the extension store"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
|
||||
let remote_ext_bucket = match &remote_ext_config["bucket"] {
|
||||
Value::String(x) => x,
|
||||
_ => bail!("remote_ext_config missing bucket"),
|
||||
};
|
||||
let remote_ext_region = match &remote_ext_config["region"] {
|
||||
Value::String(x) => x,
|
||||
_ => bail!("remote_ext_config missing region"),
|
||||
};
|
||||
let remote_ext_endpoint = match &remote_ext_config["endpoint"] {
|
||||
Value::String(x) => Some(x.clone()),
|
||||
_ => None,
|
||||
};
|
||||
let remote_ext_prefix = match &remote_ext_config["prefix"] {
|
||||
Value::String(x) => Some(x.clone()),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// load will not be large, so default parameters are fine
|
||||
let config = S3Config {
|
||||
bucket_name: remote_ext_bucket.to_string(),
|
||||
bucket_region: remote_ext_region.to_string(),
|
||||
prefix_in_bucket: remote_ext_prefix,
|
||||
endpoint: remote_ext_endpoint,
|
||||
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_keys_per_list_response: None,
|
||||
};
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
|
||||
storage: RemoteStorageKind::AwsS3(config),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config)
|
||||
}
|
||||
|
||||
// helper to collect all files in the given prefixes
|
||||
// returns hashmap of (file_name, file_remote_path)
|
||||
async fn list_files_in_prefixes(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
paths: &Vec<RemotePath>,
|
||||
) -> Result<HashMap<String, RemotePath>> {
|
||||
let mut res = HashMap::new();
|
||||
|
||||
for path in paths {
|
||||
for file in remote_storage.list_files(Some(path)).await? {
|
||||
res.insert(
|
||||
file.object_name().expect("bad object").to_owned(),
|
||||
file.to_owned(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
// helper to extract extension name
|
||||
// extension files can be in subdirectories of the extension store.
|
||||
// examples of layout:
|
||||
//
|
||||
// share/postgresql/extension/extension_name--1.0.sql
|
||||
//
|
||||
// or
|
||||
//
|
||||
// share/postgresql/extension/extension_name/extension_name--1.0.sql
|
||||
// share/postgresql/extension/extension_name/extra_data.csv
|
||||
//
|
||||
// Note: we **assume** that the extension files is in one of these formats.
|
||||
// If it is not, this code will not download it.
|
||||
fn get_ext_name(path: &str) -> Result<&str> {
|
||||
let path_suffix: Vec<&str> = path.split(&format!("{SHARE_EXT_PATH}/")).collect();
|
||||
|
||||
let path_suffix = path_suffix.last().expect("bad ext name");
|
||||
// the order of these is important
|
||||
// otherwise we'll return incorrect extension name
|
||||
// for path like share/postgresql/extension/extension_name/extension_name--1.0.sql
|
||||
for index in ["/", "--"] {
|
||||
if let Some(index) = path_suffix.find(index) {
|
||||
return Ok(&path_suffix[..index]);
|
||||
}
|
||||
}
|
||||
Ok(path_suffix)
|
||||
}
|
||||
|
||||
// helper to collect files of given prefixes for extensions
|
||||
// and group them by extension
|
||||
// returns a hashmap of (extension_name, Vector of remote paths for all files needed for this extension)
|
||||
async fn list_files_in_prefixes_for_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
paths: &Vec<RemotePath>,
|
||||
) -> Result<HashMap<String, Vec<RemotePath>>> {
|
||||
let mut result = HashMap::new();
|
||||
for path in paths {
|
||||
for file in remote_storage.list_files(Some(path)).await? {
|
||||
let file_ext_name = get_ext_name(file.get_path().to_str().context("invalid path")?)?;
|
||||
let ext_file_list = result
|
||||
.entry(file_ext_name.to_string())
|
||||
.or_insert(Vec::new());
|
||||
ext_file_list.push(file.to_owned());
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
@@ -121,62 +121,8 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from S3 on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
|
||||
let mut is_library = false;
|
||||
|
||||
if let Some(params) = req.uri().query() {
|
||||
info!("serving {:?} POST request with params: {}", route, params);
|
||||
|
||||
if params == "is_library=true" {
|
||||
is_library = true;
|
||||
} else {
|
||||
let mut resp = Response::new(Body::from("Wrong request parameters"));
|
||||
*resp.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return resp;
|
||||
}
|
||||
}
|
||||
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
|
||||
info!(
|
||||
"serving /extension_server POST request, filename: {:?} is_library: {}",
|
||||
filename, is_library
|
||||
);
|
||||
|
||||
if is_library {
|
||||
match compute.download_library_file(filename.to_string()).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("library download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
} else {
|
||||
match compute
|
||||
.download_extension_sql_files(filename.to_string())
|
||||
.await
|
||||
{
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
method => {
|
||||
info!("404 Not Found for {:?}", method);
|
||||
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||
not_found
|
||||
|
||||
@@ -139,34 +139,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
/extension_server:
|
||||
post:
|
||||
tags:
|
||||
- Extension
|
||||
summary: Download extension from S3 to local folder.
|
||||
description: ""
|
||||
operationId: downloadExtension
|
||||
responses:
|
||||
200:
|
||||
description: Extension downloaded
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'OK' if download succeeded.
|
||||
example: "OK"
|
||||
400:
|
||||
description: Request is invalid.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: Extension download request failed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
|
||||
@@ -9,7 +9,6 @@ pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
@@ -18,6 +18,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
|
||||
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
.with_ansi(false)
|
||||
.with_target(false)
|
||||
.with_writer(std::io::stderr);
|
||||
|
||||
|
||||
@@ -16,15 +16,26 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Escape a string for including it in a SQL literal
|
||||
/// Escape a string for including it in a SQL literal. Wrapping the result
|
||||
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
||||
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
||||
/// for the original implementation.
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
let res = s.replace('\'', "''").replace('\\', "\\\\");
|
||||
|
||||
if res.contains('\\') {
|
||||
format!("E'{}'", res)
|
||||
} else {
|
||||
format!("'{}'", res)
|
||||
}
|
||||
}
|
||||
|
||||
/// Escape a string so that it can be used in postgresql.conf.
|
||||
/// Same as escape_literal, currently.
|
||||
/// Escape a string so that it can be used in postgresql.conf. Wrapping the result
|
||||
/// with `'{}'` is not required, as it returns a ready-to-use config string.
|
||||
pub fn escape_conf_value(s: &str) -> String {
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
let res = s.replace('\'', "''").replace('\\', "\\\\");
|
||||
format!("'{}'", res)
|
||||
}
|
||||
|
||||
trait GenericOptionExt {
|
||||
@@ -37,7 +48,7 @@ impl GenericOptionExt for GenericOption {
|
||||
fn to_pg_option(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} '{}'", self.name, escape_literal(val)),
|
||||
"string" => format!("{} {}", self.name, escape_literal(val)),
|
||||
_ => format!("{} {}", self.name, val),
|
||||
}
|
||||
} else {
|
||||
@@ -49,7 +60,7 @@ impl GenericOptionExt for GenericOption {
|
||||
fn to_pg_setting(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
|
||||
"string" => format!("{} = {}", self.name, escape_conf_value(val)),
|
||||
_ => format!("{} = {}", self.name, val),
|
||||
}
|
||||
} else {
|
||||
@@ -215,7 +226,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
|
||||
/// Wait for Postgres to become ready to accept connections. It's ready to
|
||||
/// accept connections when the state-field in `pgdata/postmaster.pid` says
|
||||
/// 'ready'.
|
||||
#[instrument(skip(pg))]
|
||||
#[instrument(skip_all, fields(pgdata = %pgdata.display()))]
|
||||
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
|
||||
let pid_path = pgdata.join("postmaster.pid");
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
|
||||
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||
// always write all config into it creating new file.
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
@@ -397,10 +397,44 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
// We do not check either DB exists or not,
|
||||
// Postgres will take care of it for us
|
||||
"delete_db" => {
|
||||
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());
|
||||
// In Postgres we can't drop a database if it is a template.
|
||||
// So we need to unset the template flag first, but it could
|
||||
// be a retry, so we could've already dropped the database.
|
||||
// Check that database exists first to make it idempotent.
|
||||
let unset_template_query: String = format!(
|
||||
"
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname = {}
|
||||
)
|
||||
THEN
|
||||
ALTER DATABASE {} is_template false;
|
||||
END IF;
|
||||
END
|
||||
$$;",
|
||||
escape_literal(&op.name),
|
||||
&op.name.pg_quote()
|
||||
);
|
||||
// Use FORCE to drop database even if there are active connections.
|
||||
// We run this from `cloud_admin`, so it should have enough privileges.
|
||||
// NB: there could be other db states, which prevent us from dropping
|
||||
// the database. For example, if db is used by any active subscription
|
||||
// or replication slot.
|
||||
// TODO: deal with it once we allow logical replication. Proper fix should
|
||||
// involve returning an error code to the control plane, so it could
|
||||
// figure out that this is a non-retryable error, return it to the user
|
||||
// and fail operation permanently.
|
||||
let drop_db_query: String = format!(
|
||||
"DROP DATABASE IF EXISTS {} WITH (FORCE)",
|
||||
&op.name.pg_quote()
|
||||
);
|
||||
|
||||
warn!("deleting database '{}'", &op.name);
|
||||
client.execute(query.as_str(), &[])?;
|
||||
client.execute(unset_template_query.as_str(), &[])?;
|
||||
client.execute(drop_db_query.as_str(), &[])?;
|
||||
}
|
||||
"rename_db" => {
|
||||
let new_name = op.new_name.as_ref().unwrap();
|
||||
|
||||
@@ -89,4 +89,12 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
|
||||
assert_eq!(none_generic_options.find("missed_value"), None);
|
||||
assert_eq!(none_generic_options.find("invalid_value"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_literal() {
|
||||
assert_eq!(escape_literal("test"), "'test'");
|
||||
assert_eq!(escape_literal("test'"), "'test'''");
|
||||
assert_eq!(escape_literal("test\\'"), "E'test\\\\'''");
|
||||
assert_eq!(escape_literal("test\\'\\'"), "E'test\\\\''\\\\'''");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,4 +32,3 @@ utils.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
|
||||
//! The pid stored in the file is later used to stop the service.
|
||||
//!
|
||||
//! See [`lock_file`] module for more info.
|
||||
//! See the [`lock_file`](utils::lock_file) module for more info.
|
||||
|
||||
use std::ffi::OsStr;
|
||||
use std::io::Write;
|
||||
@@ -180,6 +180,11 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
|
||||
}
|
||||
|
||||
// Wait until process is gone
|
||||
wait_until_stopped(process_name, pid)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
for retries in 0..RETRIES {
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
|
||||
@@ -308,7 +308,8 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
|
||||
let mut env =
|
||||
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
|
||||
env.init(pg_version)
|
||||
let force = init_match.get_flag("force");
|
||||
env.init(pg_version, force)
|
||||
.context("Failed to initialize neon repository")?;
|
||||
|
||||
// Initialize pageserver, create initial tenant and timeline.
|
||||
@@ -657,8 +658,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
||||
|
||||
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
||||
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
@@ -700,7 +699,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
_ => {}
|
||||
}
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
endpoint.start(&auth_token, safekeepers)?;
|
||||
} else {
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
@@ -744,7 +743,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
pg_version,
|
||||
mode,
|
||||
)?;
|
||||
ep.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
ep.start(&auth_token, safekeepers)?;
|
||||
}
|
||||
}
|
||||
"stop" => {
|
||||
@@ -1004,12 +1003,6 @@ fn cli() -> Command {
|
||||
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
||||
.required(false);
|
||||
|
||||
let remote_ext_config_args = Arg::new("remote-ext-config")
|
||||
.long("remote-ext-config")
|
||||
.num_args(1)
|
||||
.help("Configure the S3 bucket that we search for extensions in.")
|
||||
.required(false);
|
||||
|
||||
let lsn_arg = Arg::new("lsn")
|
||||
.long("lsn")
|
||||
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
|
||||
@@ -1021,6 +1014,13 @@ fn cli() -> Command {
|
||||
.help("If set, the node will be a hot replica on the specified timeline")
|
||||
.required(false);
|
||||
|
||||
let force_arg = Arg::new("force")
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("force")
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Force initialization even if the repository is not empty")
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1036,6 +1036,7 @@ fn cli() -> Command {
|
||||
.value_name("config"),
|
||||
)
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(force_arg)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("timeline")
|
||||
@@ -1160,7 +1161,6 @@ fn cli() -> Command {
|
||||
.arg(pg_version_arg)
|
||||
.arg(hot_standby_arg)
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
|
||||
@@ -2,8 +2,9 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//!
|
||||
//! ```
|
||||
use anyhow::Context;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each endpoint is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/endpoints/<endpoint id>
|
||||
//! ```
|
||||
//!
|
||||
//! Some basic information about the endpoint, like the tenant and timeline IDs,
|
||||
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
|
||||
@@ -22,7 +24,7 @@
|
||||
//!
|
||||
//! Directory contents:
|
||||
//!
|
||||
//! ```ignore
|
||||
//! ```text
|
||||
//! .neon/endpoints/main/
|
||||
//! compute.log - log output of `compute_ctl` and `postgres`
|
||||
//! endpoint.json - serialized `EndpointConf` struct
|
||||
@@ -311,7 +313,7 @@ impl Endpoint {
|
||||
|
||||
// TODO: use future host field from safekeeper spec
|
||||
// Pass the list of safekeepers to the replica so that it can connect to any of them,
|
||||
// whichever is available.
|
||||
// whichever is availiable.
|
||||
let sk_ports = self
|
||||
.env
|
||||
.safekeepers
|
||||
@@ -405,15 +407,20 @@ impl Endpoint {
|
||||
String::from_utf8_lossy(&pg_ctl.stderr),
|
||||
);
|
||||
}
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
//
|
||||
// TODO use background_process::stop_process instead
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
remote_ext_config: Option<&String>,
|
||||
) -> Result<()> {
|
||||
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
|
||||
if self.status() == "running" {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
@@ -481,7 +488,6 @@ impl Endpoint {
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
private_extensions: Some(vec![self.tenant_id.to_string()]), //DEBUG ONLY
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -513,10 +519,13 @@ impl Endpoint {
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
if let Some(remote_ext_config) = remote_ext_config {
|
||||
cmd.args(["--remote-ext-config", remote_ext_config]);
|
||||
}
|
||||
let _child = cmd.spawn()?;
|
||||
let child = cmd.spawn()?;
|
||||
|
||||
// Write down the pid so we can wait for it when we want to stop
|
||||
// TODO use background_process::start_process instead
|
||||
let pid = child.id();
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
std::fs::write(pidfile_path, pid.to_string())?;
|
||||
|
||||
// Wait for it to start
|
||||
let mut attempt = 0;
|
||||
|
||||
@@ -169,6 +169,7 @@ impl LocalEnv {
|
||||
match pg_version {
|
||||
14 => Ok(path.join(format!("v{pg_version}"))),
|
||||
15 => Ok(path.join(format!("v{pg_version}"))),
|
||||
16 => Ok(path.join(format!("v{pg_version}"))),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -177,6 +178,7 @@ impl LocalEnv {
|
||||
match pg_version {
|
||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
16 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -184,6 +186,7 @@ impl LocalEnv {
|
||||
match pg_version {
|
||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
16 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -364,7 +367,7 @@ impl LocalEnv {
|
||||
//
|
||||
// Initialize a new Neon repository
|
||||
//
|
||||
pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
|
||||
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = &self.base_data_dir;
|
||||
ensure!(
|
||||
@@ -372,11 +375,29 @@ impl LocalEnv {
|
||||
"repository base path is missing"
|
||||
);
|
||||
|
||||
ensure!(
|
||||
!base_path.exists(),
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.display()
|
||||
);
|
||||
if base_path.exists() {
|
||||
if force {
|
||||
println!("removing all contents of '{}'", base_path.display());
|
||||
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
|
||||
// all contents inside. This helps if the developer symbol links another directory (i.e.,
|
||||
// S3 local SSD) to the `.neon` base directory.
|
||||
for entry in std::fs::read_dir(base_path)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
fs::remove_dir_all(&path)?;
|
||||
} else {
|
||||
fs::remove_file(&path)?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bail!(
|
||||
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
|
||||
base_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
@@ -392,7 +413,9 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
fs::create_dir(base_path)?;
|
||||
if !base_path.exists() {
|
||||
fs::create_dir(base_path)?;
|
||||
}
|
||||
|
||||
// Generate keypair for JWT.
|
||||
//
|
||||
|
||||
@@ -2,8 +2,9 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//!
|
||||
//! ```
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
|
||||
@@ -189,7 +189,7 @@ services:
|
||||
- "/bin/bash"
|
||||
- "-c"
|
||||
command:
|
||||
- "until pg_isready -h compute -p 55433 ; do
|
||||
- "until pg_isready -h compute -p 55433 -U cloud_admin ; do
|
||||
echo 'Waiting to start compute...' && sleep 1;
|
||||
done"
|
||||
depends_on:
|
||||
|
||||
@@ -48,6 +48,7 @@ Creating docker-compose_storage_broker_1 ... done
|
||||
2. connect compute node
|
||||
```
|
||||
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
|
||||
$ chmod 600 ~/.pgpass
|
||||
$ psql -h localhost -p 55433 -U cloud_admin
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
CREATE TABLE
|
||||
|
||||
@@ -1,301 +0,0 @@
|
||||
# Supporting custom user Extensions
|
||||
|
||||
Created 2023-05-03
|
||||
|
||||
## Motivation
|
||||
|
||||
There are many extensions in the PostgreSQL ecosystem, and not all extensions
|
||||
are of a quality that we can confidently support them. Additionally, our
|
||||
current extension inclusion mechanism has several problems because we build all
|
||||
extensions into the primary Compute image: We build the extensions every time
|
||||
we build the compute image regardless of whether we actually need to rebuild
|
||||
the image, and the inclusion of these extensions in the image adds a hard
|
||||
dependency on all supported extensions - thus increasing the image size, and
|
||||
with it the time it takes to download that image - increasing first start
|
||||
latency.
|
||||
|
||||
This RFC proposes a dynamic loading mechanism that solves most of these
|
||||
problems.
|
||||
|
||||
## Summary
|
||||
|
||||
`compute_ctl` is made responsible for loading extensions on-demand into
|
||||
the container's file system for dynamically loaded extensions, and will also
|
||||
make sure that the extensions in `shared_preload_libraries` are downloaded
|
||||
before the compute node starts.
|
||||
|
||||
## Components
|
||||
|
||||
compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
|
||||
|
||||
## Requirements
|
||||
|
||||
Compute nodes with no extra extensions should not be negatively impacted by
|
||||
the existence of support for many extensions.
|
||||
|
||||
Installing an extension into PostgreSQL should be easy.
|
||||
|
||||
Non-preloaded extensions shouldn't impact startup latency.
|
||||
|
||||
Uninstalled extensions shouldn't impact query latency.
|
||||
|
||||
A small latency penalty for dynamically loaded extensions is acceptable in
|
||||
the first seconds of compute startup, but not in steady-state operations.
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
### On-demand, JIT-loading of extensions
|
||||
|
||||
TLDR; we download extensions as soon as we need them, or when we have spare
|
||||
time.
|
||||
|
||||
That means, we first download the extensions required to start the PostMaster
|
||||
(`shared_preload_libraries` and their dependencies), then the libraries required
|
||||
before a backend can start processing user input (`preload_libraries` and
|
||||
dependencies), and then (with network limits applied) the remainder of the
|
||||
configured extensions, with prioritization for installed extensions.
|
||||
|
||||
If PostgreSQL tries to load a library that is not yet fully on disk, it will
|
||||
ask `compute_ctl` first if the extension has been downloaded yet, and will wait
|
||||
for `compute_ctl` to finish downloading that extension. `compute_ctl` will
|
||||
prioritize downloading that extension over other extensions that were not yet
|
||||
requested.
|
||||
|
||||
#### Workflow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant EX as External (control plane, ...)
|
||||
participant CTL as compute_ctl
|
||||
participant ST as extension store
|
||||
actor PG as PostgreSQL
|
||||
|
||||
EX ->>+ CTL: Start compute with config X
|
||||
|
||||
note over CTL: The configuration contains a list of all <br/>extensions available to that compute node, etc.
|
||||
|
||||
par Optionally parallel or concurrent
|
||||
loop Available extensions
|
||||
CTL ->>+ ST: Download control file of extension
|
||||
activate CTL
|
||||
ST ->>- CTL: Finish downloading control file
|
||||
CTL ->>- CTL: Put control file in extensions directory
|
||||
end
|
||||
|
||||
loop For each extension in shared_preload_libraries
|
||||
CTL ->>+ ST: Download extension's data
|
||||
activate CTL
|
||||
ST ->>- CTL: Finish downloading
|
||||
CTL ->>- CTL: Put extension's files in the right place
|
||||
end
|
||||
end
|
||||
|
||||
CTL ->>+ PG: Start PostgreSQL
|
||||
|
||||
note over CTL: PostgreSQL can now start accepting <br/>connections. However, users may still need to wait <br/>for preload_libraries extensions to get downloaded.
|
||||
|
||||
par Load preload_libraries
|
||||
loop For each extension in preload_libraries
|
||||
CTL ->>+ ST: Download extension's data
|
||||
activate CTL
|
||||
ST ->>- CTL: Finish downloading
|
||||
CTL ->>- CTL: Put extension's files in the right place
|
||||
end
|
||||
end
|
||||
|
||||
note over CTL: After this, connections don't have any hard <br/>waits for extension files left, except for those <br/>connections that override preload_libraries <br/>in their startup packet
|
||||
|
||||
par PG's internal_load_library(library)
|
||||
alt Library is not yet loaded
|
||||
PG ->>+ CTL: Load library X
|
||||
CTL ->>+ ST: Download the extension that provides X
|
||||
ST ->>- CTL: Finish downloading
|
||||
CTL ->> CTL: Put extension's files in the right place
|
||||
CTL ->>- PG: Ready
|
||||
else Library is already loaded
|
||||
note over PG: No-op
|
||||
end
|
||||
and Download all remaining extensions
|
||||
loop Extension X
|
||||
CTL ->>+ ST: Download not-yet-downloaded extension X
|
||||
activate CTL
|
||||
ST ->>- CTL: Finish downloading
|
||||
CTL ->>- CTL: Put extension's files in the right place
|
||||
end
|
||||
end
|
||||
|
||||
deactivate PG
|
||||
deactivate CTL
|
||||
```
|
||||
|
||||
#### Summary
|
||||
|
||||
Pros:
|
||||
- Startup is only as slow as it takes to load all (shared_)preload_libraries
|
||||
- Supports BYO Extension
|
||||
|
||||
Cons:
|
||||
- O(sizeof(extensions)) IO requirement for loading all extensions.
|
||||
|
||||
### Alternative solutions
|
||||
|
||||
1. Allow users to add their extensions to the base image
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
|
||||
Cons:
|
||||
- Doesn't scale - first start size is dependent on image size;
|
||||
- All extensions are shared across all users: It doesn't allow users to
|
||||
bring their own restrictive-licensed extensions
|
||||
|
||||
2. Bring Your Own compute image
|
||||
|
||||
Pros:
|
||||
- Still easy to deploy
|
||||
- User can bring own patched version of PostgreSQL
|
||||
|
||||
Cons:
|
||||
- First start latency is O(sizeof(extensions image))
|
||||
- Warm instance pool for skipping pod schedule latency is not feasible with
|
||||
O(n) custom images
|
||||
- Support channels are difficult to manage
|
||||
|
||||
3. Download all user extensions in bulk on compute start
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- No startup latency issues for "clean" users.
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- Downloading all extensions in advance takes a lot of time, thus startup
|
||||
latency issues
|
||||
|
||||
4. Store user's extensions in persistent storage
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- No startup latency issues
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- EC2 instances have only limited number of attachments shared between EBS
|
||||
volumes, direct-attached NVMe drives, and ENIs.
|
||||
- Compute instance migration isn't trivially solved for EBS mounts (e.g.
|
||||
the device is unavailable whilst moving the mount between instances).
|
||||
- EBS can only mount on one instance at a time (except the expensive IO2
|
||||
device type).
|
||||
|
||||
5. Store user's extensions in network drive
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- Few startup latency issues
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- We'd need networked drives, and a lot of them, which would store many
|
||||
duplicate extensions.
|
||||
- **UNCHECKED:** Compute instance migration may not work nicely with
|
||||
networked IOs
|
||||
|
||||
|
||||
### Idea extensions
|
||||
|
||||
The extension store does not have to be S3 directly, but could be a Node-local
|
||||
caching service on top of S3. This would reduce the load on the network for
|
||||
popular extensions.
|
||||
|
||||
## Extension Store implementation
|
||||
|
||||
Extension Store in our case is a private S3 bucket.
|
||||
Extensions are stored as tarballs in the bucket. The tarball contains the extension's control file and all the files that the extension needs to run.
|
||||
|
||||
We may also store the control file separately from the tarball to speed up the extension loading.
|
||||
|
||||
`s3://<the-bucket>/extensions/ext-name/sha-256+1234abcd1234abcd1234abcd1234abcd/bundle.tar`
|
||||
|
||||
where `ext-name` is an extension name and `sha-256+1234abcd1234abcd1234abcd1234abcd` is a hash of a specific extension version tarball.
|
||||
|
||||
To ensure security, there is no direct access to the S3 bucket from compute node.
|
||||
|
||||
Control plane forms a list of extensions available to the compute node
|
||||
and forms a short-lived [pre-signed URL](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html)
|
||||
for each extension that is available to the compute node.
|
||||
|
||||
so, `compute_ctl` receives spec in the following format
|
||||
|
||||
```
|
||||
"extensions": [{
|
||||
"meta_format": 1,
|
||||
"extension_name": "postgis",
|
||||
"link": "https://<the-bucket>/extensions/sha-256+1234abcd1234abcd1234abcd1234abcd/bundle.tar?AWSAccessKeyId=1234abcd1234abcd1234abcd1234abcd&Expires=1234567890&Signature=1234abcd1234abcd1234abcd1234abcd",
|
||||
...
|
||||
}]
|
||||
```
|
||||
|
||||
`compute_ctl` then downloads the extension from the link and unpacks it to the right place.
|
||||
|
||||
### How do we handle private extensions?
|
||||
|
||||
Private and public extensions are treated equally from the Extension Store perspective.
|
||||
The only difference is that the private extensions are not listed in the user UI (managed by control plane).
|
||||
|
||||
### How to add new extension to the Extension Store?
|
||||
|
||||
Since we need to verify that the extension is compatible with the compute node and doesn't contain any malicious code,
|
||||
we need to review the extension before adding it to the Extension Store.
|
||||
|
||||
I do not expect that we will have a lot of extensions to review, so we can do it manually for now.
|
||||
|
||||
Some admin UI may be added later to automate this process.
|
||||
|
||||
The list of extensions available to a compute node is stored in the console database.
|
||||
|
||||
### How is the list of available extensions managed?
|
||||
|
||||
We need to add new tables to the console database to store the list of available extensions, their versions and access rights.
|
||||
|
||||
something like this:
|
||||
|
||||
```
|
||||
CREATE TABLE extensions (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
version VARCHAR(255) NOT NULL,
|
||||
hash VARCHAR(255) NOT NULL, // this is the path to the extension in the Extension Store
|
||||
supported_postgres_versions integer[] NOT NULL,
|
||||
is_public BOOLEAN NOT NULL, // public extensions are available to all users
|
||||
is_shared_preload BOOLEAN NOT NULL, // these extensions require postgres restart
|
||||
is_preload BOOLEAN NOT NULL,
|
||||
license VARCHAR(255) NOT NULL,
|
||||
);
|
||||
|
||||
CREATE TABLE user_extensions (
|
||||
user_id INTEGER NOT NULL,
|
||||
extension_id INTEGER NOT NULL,
|
||||
FOREIGN KEY (user_id) REFERENCES users (id),
|
||||
FOREIGN KEY (extension_id) REFERENCES extensions (id)
|
||||
);
|
||||
```
|
||||
|
||||
When new extension is added to the Extension Store, we add a new record to the table and set permissions.
|
||||
|
||||
In UI, user may select the extensions that they want to use with their compute node.
|
||||
|
||||
NOTE: Extensions that require postgres restart will not be available until the next compute restart.
|
||||
Also, currently user cannot force postgres restart. We should add this feature later.
|
||||
|
||||
For other extensions, we must communicate updates to `compute_ctl` and they will be downloaded in the background.
|
||||
|
||||
### How can user update the extension?
|
||||
|
||||
User can update the extension by selecting the new version of the extension in the UI.
|
||||
|
||||
### Alternatives
|
||||
|
||||
For extensions written on trusted languages we can also adopt
|
||||
`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
|
||||
This will increase the amount supported extensions and decrease the amount of work required to support them.
|
||||
84
docs/rfcs/024-user-mgmt.md
Normal file
84
docs/rfcs/024-user-mgmt.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# Postgres user and database management
|
||||
|
||||
(This supersedes the previous proposal that looked too complicated and desynchronization-prone)
|
||||
|
||||
We've accumulated a bunch of problems with our approach to role and database management, namely:
|
||||
|
||||
1. we don't allow role and database creation from Postgres, and users are complaining about that
|
||||
2. fine-grained role management is not possible both from Postgres and console
|
||||
|
||||
Right now, we do store users and databases both in console and Postgres, and there are two main reasons for
|
||||
that:
|
||||
|
||||
* we want to be able to authenticate users in proxy against the console without Postgres' involvement. Otherwise,
|
||||
malicious brute force attempts will wake up Postgres (expensive) and may exhaust the Postgres connections limit (deny of service).
|
||||
* it is handy when we can render console UI without waking up compute (e.g., show database list)
|
||||
|
||||
This RFC doesn't talk about giving root access to the database, which is blocked by a secure runtime setup.
|
||||
|
||||
## Overview
|
||||
|
||||
* Add Postgres extension that sends an HTTP request each time transaction that modifies users/databases is about to commit.
|
||||
* Add user management API to internal console API. Also, the console should put a JWT token into the compute so that it can access management API.
|
||||
|
||||
## Postgres behavior
|
||||
|
||||
The default user role (@username) should have `CREATE ROLE`, `CREATE DB`, and `BYPASSRLS` privileges. We expose the Postgres port
|
||||
to the open internet, so we need to check password strength. Now console generates strong passwords, so there is no risk of having dumb passwords. With user-provided passwords, such risks exist.
|
||||
|
||||
Since we store passwords in the console we should also send unencrypted password when role is created/changed. Hence communication with the console must be encrypted. Postgres also supports creating roles using hashes, in that case, we will not be able to get a raw password. So I can see the following options here:
|
||||
* roles created via SQL will *not* have raw passwords in the console
|
||||
* roles created via SQL will have raw passwords in the console, except ones that were created using hashes
|
||||
|
||||
I'm leaning towards the second option here as it is a bit more consistent one -- if raw password storage is enabled then we store passwords in all cases where we can store them.
|
||||
|
||||
To send data about roles and databases from Postgres to the console we can create the following Postgres extension:
|
||||
|
||||
* Intercept role/database changes in `ProcessUtility_hook`. Here we have access to the query statement with the raw password. The hook handler itself should not dial the console immediately and rather stash info in some hashmap for later use.
|
||||
* When the transaction is about to commit we execute collected role modifications (all as one -- console should either accept all or reject all, and hence API shouldn't be REST-like). If the console request fails we can roll back the transaction. This way if the transaction is committed we know for sure that console has this information. We can use `XACT_EVENT_PRE_COMMIT` and `XACT_EVENT_PARALLEL_PRE_COMMIT` for that.
|
||||
* Extension should be mindful of the fact that it is possible to create and delete roles within the transaction.
|
||||
* We also need to track who is database owner, some coding around may be needed to get the current user when the database is created.
|
||||
|
||||
## Console user management API
|
||||
|
||||
The current public API has REST API for role management. We need to have some analog for the internal API (called mgmt API in the console code). But unlike public API here we want to have an atomic way to create several roles/databases (in cases when several roles were created in the same transaction). So something like that may work:
|
||||
|
||||
```
|
||||
curl -X PATCH /api/v1/roles_and_databases -d '
|
||||
[
|
||||
{"op":"create", "type":"role", "name": "kurt", "password":"lYgT3BlbkFJ2vBZrqv"},
|
||||
{"op":"drop", "type":"role", "name": "trout"},
|
||||
{"op":"alter", "type":"role", "name": "kilgore", "password":"3BlbkFJ2vB"},
|
||||
{"op":"create", "type":"database", "name": "db2", "owner": "eliot"},
|
||||
]
|
||||
'
|
||||
```
|
||||
|
||||
Makes sense not to error out on duplicated create/delete operations (see failure modes)
|
||||
|
||||
## Managing users from the console
|
||||
|
||||
Now console puts a spec file with the list of databases/roles and delta operations in all the compute pods. `compute_ctl` then picks up that file and stubbornly executes deltas and checks data in the spec file is the same as in the Postgres. This way if the user creates a role in the UI we restart compute with a new spec file and during the start databases/roles are created. So if Postgres send an HTTP call each time role is created we need to break recursion in that case. We can do that based on application_name or some GUC or user (local == no HTTP hook).
|
||||
|
||||
Generally, we have several options when we are creating users via console:
|
||||
|
||||
1. restart compute with a new spec file, execute local SQL command; cut recursion in the extension
|
||||
2. "push" spec files into running compute, execute local SQL command; cut recursion in the extension
|
||||
3. "push" spec files into running compute, execute local SQL command; let extension create those roles in the console
|
||||
4. avoid managing roles via spec files, send SQL commands to compute; let extension create those roles in the console
|
||||
|
||||
The last option is the most straightforward one, but with the raw password storage opt-out, we will not have the password to establish an SQL connection. Also, we need a spec for provisioning purposes and to address potential desync (but that is quite unlikely). So I think the easiest approach would be:
|
||||
|
||||
1. keep role management like it is now and cut the recursion in the extension when SQL is executed by compute_ctl
|
||||
2. add "push" endpoint to the compute_ctl to avoid compute restart during the `apply_config` operation -- that can be done as a follow up to avoid increasing scope too much
|
||||
|
||||
## Failure modes
|
||||
|
||||
* during role creation via SQL role was created in the console but the connection was dropped before Postgres got acknowledgment or some error happened after acknowledgment (out of disk space, deadlock, etc):
|
||||
|
||||
in that case, Postgres won't have a role that exists in the console. Compute restart will heal it (due to the spec file). Also if the console allows repeated creation/deletion user can repeat the transaction.
|
||||
|
||||
|
||||
# Scalability
|
||||
|
||||
On my laptop, I can create 4200 roles per second. That corresponds to 363 million roles per day. Since each role creation ends up in the console database we can add some limit to the number of roles (could be reasonably big to not run into it often -- like 1k or 10k).
|
||||
22
docs/tools.md
Normal file
22
docs/tools.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Useful development tools
|
||||
|
||||
This readme contains some hints on how to set up some optional development tools.
|
||||
|
||||
## ccls
|
||||
|
||||
[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
|
||||
to work well. There are different ways to do it but here's what works for me:
|
||||
1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
|
||||
2. Go to `vendor/postgres-v15`
|
||||
3. Run `make clean && ./configure`
|
||||
4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
|
||||
5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
|
||||
6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
|
||||
|
||||
With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
|
||||
|
||||
Some additional tips for various IDEs:
|
||||
|
||||
### Emacs
|
||||
|
||||
To improve performance: `(setq lsp-lens-enable nil)`
|
||||
@@ -71,6 +71,8 @@ pub struct ComputeMetrics {
|
||||
pub wait_for_spec_ms: u64,
|
||||
pub sync_safekeepers_ms: u64,
|
||||
pub basebackup_ms: u64,
|
||||
pub basebackup_bytes: u64,
|
||||
pub start_postgres_ms: u64,
|
||||
pub config_ms: u64,
|
||||
pub total_startup_ms: u64,
|
||||
}
|
||||
|
||||
@@ -60,8 +60,6 @@ pub struct ComputeSpec {
|
||||
/// If set, 'storage_auth_token' is used as the password to authenticate to
|
||||
/// the pageserver and safekeepers.
|
||||
pub storage_auth_token: Option<String>,
|
||||
|
||||
pub private_extensions: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
|
||||
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
|
||||
@@ -411,12 +411,16 @@ pub struct LayerResidenceEvent {
|
||||
pub reason: LayerResidenceEventReason,
|
||||
}
|
||||
|
||||
/// The reason for recording a given [`ResidenceEvent`].
|
||||
/// The reason for recording a given [`LayerResidenceEvent`].
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum LayerResidenceEventReason {
|
||||
/// The layer map is being populated, e.g. during timeline load or attach.
|
||||
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
|
||||
/// We need to record such events because there is no persistent storage for the events.
|
||||
///
|
||||
// https://github.com/rust-lang/rust/issues/74481
|
||||
/// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
|
||||
/// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
|
||||
LayerLoad,
|
||||
/// We just created the layer (e.g., freeze_and_flush or compaction).
|
||||
/// Such layers are always [`LayerResidenceStatus::Resident`].
|
||||
|
||||
@@ -60,8 +60,9 @@ impl Ord for RelTag {
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// ```text
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
/// ```
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
|
||||
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
|
||||
PathBuf::from("pg_install")
|
||||
};
|
||||
|
||||
for pg_version in &["v14", "v15"] {
|
||||
for pg_version in &["v14", "v15", "v16"] {
|
||||
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
|
||||
if pg_install_dir_versioned.is_relative() {
|
||||
let cwd = env::current_dir().context("Failed to get current_dir")?;
|
||||
|
||||
@@ -51,6 +51,7 @@ macro_rules! for_all_postgres_versions {
|
||||
($macro:tt) => {
|
||||
$macro!(v14);
|
||||
$macro!(v15);
|
||||
$macro!(v16);
|
||||
};
|
||||
}
|
||||
|
||||
@@ -92,9 +93,10 @@ pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
||||
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
|
||||
match version {
|
||||
14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
|
||||
15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|
||||
15 | 16 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|
||||
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
|
||||
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
|
||||
|
||||
_ => anyhow::bail!("Unknown version {}", version),
|
||||
}
|
||||
}
|
||||
@@ -110,6 +112,7 @@ pub fn generate_wal_segment(
|
||||
match pg_version {
|
||||
14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||
15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||
16 => v16::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||
_ => Err(SerializeError::BadInput),
|
||||
}
|
||||
}
|
||||
@@ -123,6 +126,7 @@ pub fn generate_pg_control(
|
||||
match pg_version {
|
||||
14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||
15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||
16 => v16::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||
_ => anyhow::bail!("Unknown version {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -197,7 +201,7 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
|
||||
|
||||
pub mod waldecoder {
|
||||
|
||||
use crate::{v14, v15};
|
||||
use crate::{v14, v15, v16};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use std::num::NonZeroU32;
|
||||
use thiserror::Error;
|
||||
@@ -259,6 +263,10 @@ pub mod waldecoder {
|
||||
use self::v15::waldecoder_handler::WalStreamDecoderHandler;
|
||||
self.poll_decode_internal()
|
||||
}
|
||||
16 => {
|
||||
use self::v16::waldecoder_handler::WalStreamDecoderHandler;
|
||||
self.poll_decode_internal()
|
||||
}
|
||||
_ => Err(WalDecodeError {
|
||||
msg: format!("Unknown version {}", self.pg_version),
|
||||
lsn: self.lsn,
|
||||
|
||||
@@ -49,14 +49,16 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
||||
///
|
||||
/// Formats:
|
||||
///
|
||||
/// ```text
|
||||
/// <oid>
|
||||
/// <oid>_<fork name>
|
||||
/// <oid>.<segment number>
|
||||
/// <oid>_<fork name>.<segment number>
|
||||
/// ```
|
||||
///
|
||||
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
||||
///
|
||||
|
||||
@@ -52,6 +52,7 @@ impl Conf {
|
||||
match self.pg_version {
|
||||
14 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||
15 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||
16 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||
_ => bail!("Unsupported postgres version: {}", self.pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,11 +5,11 @@
|
||||
//! It is similar to what tokio_util::codec::Framed with appropriate codec
|
||||
//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
|
||||
//! separately without using split from futures::stream::StreamExt (which
|
||||
//! allocates box[1] in polling internally). tokio::io::split is used for splitting
|
||||
//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
|
||||
//! instead. Plus we customize error messages more than a single type for all io
|
||||
//! calls.
|
||||
//!
|
||||
//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
||||
//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
||||
use bytes::{Buf, BytesMut};
|
||||
use std::{
|
||||
future::Future,
|
||||
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
|
||||
/// Split into owned read and write parts. Beware of potential issues with
|
||||
/// using halves in different tasks on TLS stream:
|
||||
/// https://github.com/tokio-rs/tls/issues/40
|
||||
/// <https://github.com/tokio-rs/tls/issues/40>
|
||||
pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
|
||||
let (read_half, write_half) = tokio::io::split(self.stream);
|
||||
let reader = FramedReader {
|
||||
|
||||
@@ -934,6 +934,15 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
||||
let mut terminated = [0; 6];
|
||||
for (i, &elem) in code.iter().enumerate() {
|
||||
terminated[i] = elem;
|
||||
}
|
||||
|
||||
terminated
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -965,12 +974,3 @@ mod tests {
|
||||
assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]);
|
||||
}
|
||||
}
|
||||
|
||||
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
||||
let mut terminated = [0; 6];
|
||||
for (i, &elem) in code.iter().enumerate() {
|
||||
terminated[i] = elem;
|
||||
}
|
||||
|
||||
terminated
|
||||
}
|
||||
|
||||
@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// No limits on the client side, which currenltly means 1000 for AWS S3.
|
||||
/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
@@ -50,6 +50,12 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct RemotePath(PathBuf);
|
||||
|
||||
impl std::fmt::Display for RemotePath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0.display())
|
||||
}
|
||||
}
|
||||
|
||||
impl RemotePath {
|
||||
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
|
||||
anyhow::ensure!(
|
||||
@@ -184,20 +190,6 @@ pub enum GenericRemoteStorage {
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
|
||||
// lists common *prefixes*, if any of files
|
||||
// Example:
|
||||
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -209,6 +201,14 @@ impl GenericRemoteStorage {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
io::ErrorKind,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
@@ -150,10 +151,7 @@ impl RemoteStorage for LocalFs {
|
||||
let mut files = vec![];
|
||||
let mut directory_queue = vec![full_path.clone()];
|
||||
|
||||
while !directory_queue.is_empty() {
|
||||
let cur_folder = directory_queue
|
||||
.pop()
|
||||
.expect("queue cannot be empty: we just checked");
|
||||
while let Some(cur_folder) = directory_queue.pop() {
|
||||
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let file_name: PathBuf = entry.file_name().into();
|
||||
@@ -343,18 +341,14 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let file_path = path.with_base(&self.storage_root);
|
||||
if !file_path.exists() {
|
||||
match fs::remove_file(&file_path).await {
|
||||
Ok(()) => Ok(()),
|
||||
// The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour.
|
||||
// See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
|
||||
// > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
|
||||
return Ok(());
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
|
||||
Err(e) => Err(anyhow::anyhow!(e)),
|
||||
}
|
||||
|
||||
if !file_path.is_file() {
|
||||
anyhow::bail!("{file_path:?} is not a file");
|
||||
}
|
||||
Ok(fs::remove_file(file_path)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))?)
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
|
||||
@@ -349,17 +349,10 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut folder_name = folder
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// remove leading "/" if one exists
|
||||
if let Some(folder_name_slash) = folder_name.clone() {
|
||||
if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
folder_name = Some(folder_name_slash[1..].to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
|
||||
@@ -173,10 +173,15 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
|
||||
let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data1 = "remote blob data1".as_bytes();
|
||||
let data1_len = data1.len();
|
||||
let data2 = "remote blob data2".as_bytes();
|
||||
let data2_len = data2.len();
|
||||
let data3 = "remote blob data3".as_bytes();
|
||||
let data3_len = data3.len();
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
|
||||
.await?;
|
||||
@@ -185,8 +190,18 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
|
||||
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
|
||||
.await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
|
||||
.await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
let prefixes = ctx.client.list_prefixes(None).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
ctx.client.delete_objects(&[path3]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
/// `Segment` which has had its size calculated.
|
||||
#[derive(Clone, Debug)]
|
||||
struct SegmentSize {
|
||||
method: SegmentMethod,
|
||||
|
||||
@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
|
||||
/// directly into HTTP servers. However, I couldn't find one for Hyper,
|
||||
/// so I had to write our own. OpenTelemetry website has a registry of
|
||||
/// instrumentation libraries at:
|
||||
/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
|
||||
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
|
||||
/// If a Hyper crate appears, consider switching to that.
|
||||
pub async fn tracing_handler<F, R>(
|
||||
req: Request<Body>,
|
||||
|
||||
@@ -5,7 +5,6 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
atty.workspace = true
|
||||
sentry.workspace = true
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
@@ -41,6 +40,8 @@ pq_proto.workspace = true
|
||||
metrics.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
|
||||
/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
|
||||
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||
request: &mut Request<Body>,
|
||||
) -> Result<Option<T>, ApiError> {
|
||||
|
||||
@@ -109,10 +109,16 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
|
||||
/// * building in docker (either in CI or locally)
|
||||
///
|
||||
/// One thing to note is that .git is not available in docker (and it is bad to include it there).
|
||||
/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
|
||||
/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
|
||||
/// Git version received from environment variable used as a fallback in git_version invocation.
|
||||
/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
|
||||
/// When building locally, the `git_version` is used to query .git. When building on CI and docker,
|
||||
/// we don't build the actual PR branch commits, but always a "phantom" would be merge commit to
|
||||
/// the target branch -- the actual PR commit from which we build from is supplied as GIT_VERSION
|
||||
/// environment variable.
|
||||
///
|
||||
/// We ended up with this compromise between phantom would be merge commits vs. pull request branch
|
||||
/// heads due to old logs becoming more reliable (github could gc the phantom merge commit
|
||||
/// anytime) in #4641.
|
||||
///
|
||||
/// To avoid running buildscript every recompilation, we use rerun-if-env-changed option.
|
||||
/// So the build script will be run only when GIT_VERSION envvar has changed.
|
||||
///
|
||||
/// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
|
||||
@@ -124,25 +130,36 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
|
||||
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
|
||||
///
|
||||
/// #############################################################################################
|
||||
/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
|
||||
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
|
||||
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
||||
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
||||
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
||||
#[macro_export]
|
||||
macro_rules! project_git_version {
|
||||
($const_identifier:ident) => {
|
||||
const $const_identifier: &str = git_version::git_version!(
|
||||
prefix = "git:",
|
||||
fallback = concat!(
|
||||
"git-env:",
|
||||
env!("GIT_VERSION", "Missing GIT_VERSION envvar")
|
||||
),
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
);
|
||||
// this should try GIT_VERSION first only then git_version::git_version!
|
||||
const $const_identifier: &::core::primitive::str = {
|
||||
const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! {
|
||||
prefix = "",
|
||||
fallback = "unknown",
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
};
|
||||
|
||||
const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("GIT_VERSION") {
|
||||
::core::option::Option::Some(x) => ["git-env:", x],
|
||||
::core::option::Option::None => ["git:", __COMMIT_FROM_GIT],
|
||||
};
|
||||
|
||||
$crate::__const_format::concatcp!(__ARG[0], __ARG[1])
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
/// Re-export for `project_git_version` macro
|
||||
#[doc(hidden)]
|
||||
pub use const_format as __const_format;
|
||||
|
||||
/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
|
||||
#[macro_export]
|
||||
macro_rules! const_assert {
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
//! A module to create and read lock files.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`] exclusive locks.
|
||||
//! The only consumer of this module is currently [`pid_file`].
|
||||
//! See the module-level comment there for potential pitfalls
|
||||
//! with lock files that are used to store PIDs (pidfiles).
|
||||
//! The only consumer of this module is currently
|
||||
//! [`pid_file`](crate::pid_file). See the module-level comment
|
||||
//! there for potential pitfalls with lock files that are used
|
||||
//! to store PIDs (pidfiles).
|
||||
|
||||
use std::{
|
||||
fs,
|
||||
@@ -81,7 +82,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
|
||||
}
|
||||
|
||||
/// Returned by [`read_and_hold_lock_file`].
|
||||
/// Check out the [`pid_file`] module for what the variants mean
|
||||
/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
|
||||
/// and potential caveats if the lock files that are used to store PIDs.
|
||||
pub enum LockFileRead {
|
||||
/// No file exists at the given path.
|
||||
|
||||
@@ -84,7 +84,7 @@ pub fn init(
|
||||
let r = r.with({
|
||||
let log_layer = tracing_subscriber::fmt::layer()
|
||||
.with_target(false)
|
||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
||||
.with_ansi(false)
|
||||
.with_writer(std::io::stdout);
|
||||
let log_layer = match log_format {
|
||||
LogFormat::Json => log_layer.json().boxed(),
|
||||
@@ -112,7 +112,7 @@ pub fn init(
|
||||
///
|
||||
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
||||
/// If the assumptions about the initialization order are not held, use
|
||||
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// lost.
|
||||
#[must_use]
|
||||
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use pin_project_lite::pin_project;
|
||||
use std::io::Read;
|
||||
use std::pin::Pin;
|
||||
use std::{io, task};
|
||||
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
|
||||
@@ -75,3 +76,34 @@ impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S,
|
||||
self.project().stream.poll_shutdown(context)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for a reader that counts bytes read.
|
||||
///
|
||||
/// Similar to MeasuredStream but it's one way and it's sync
|
||||
pub struct MeasuredReader<R: Read> {
|
||||
inner: R,
|
||||
byte_count: usize,
|
||||
}
|
||||
|
||||
impl<R: Read> MeasuredReader<R> {
|
||||
pub fn new(reader: R) -> Self {
|
||||
Self {
|
||||
inner: reader,
|
||||
byte_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_byte_count(&self) -> usize {
|
||||
self.byte_count
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Read for MeasuredReader<R> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
let result = self.inner.read(buf);
|
||||
if let Ok(n_bytes) = result {
|
||||
self.byte_count += n_bytes
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,9 +23,9 @@ pub enum SeqWaitError {
|
||||
|
||||
/// Monotonically increasing value
|
||||
///
|
||||
/// It is handy to store some other fields under the same mutex in SeqWait<S>
|
||||
/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
|
||||
/// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
|
||||
/// any type that can expose counter. <V> is the type of exposed counter.
|
||||
/// any type that can expose counter. `V` is the type of exposed counter.
|
||||
pub trait MonotonicCounter<V> {
|
||||
/// Bump counter value and check that it goes forward
|
||||
/// N.B.: new_val is an actual new value, not a difference.
|
||||
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
|
||||
/// [`wait_for`]: SeqWait::wait_for
|
||||
/// [`advance`]: SeqWait::advance
|
||||
///
|
||||
/// <S> means Storage, <V> is type of counter that this storage exposes.
|
||||
/// `S` means Storage, `V` is type of counter that this storage exposes.
|
||||
///
|
||||
pub struct SeqWait<S, V>
|
||||
where
|
||||
|
||||
@@ -1,8 +1,15 @@
|
||||
//! Assert that the current [`tracing::Span`] has a given set of fields.
|
||||
//!
|
||||
//! Can only produce meaningful positive results when tracing has been configured as in example.
|
||||
//! Absence of `tracing_error::ErrorLayer` is not detected yet.
|
||||
//!
|
||||
//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
|
||||
//! is completly unconfigured.
|
||||
//!
|
||||
//! # Usage
|
||||
//!
|
||||
//! ```
|
||||
//! ```rust
|
||||
//! # fn main() {
|
||||
//! use tracing_subscriber::prelude::*;
|
||||
//! let registry = tracing_subscriber::registry()
|
||||
//! .with(tracing_error::ErrorLayer::default());
|
||||
@@ -20,23 +27,18 @@
|
||||
//!
|
||||
//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
||||
//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
|
||||
//! match check_fields_present([&extractor]) {
|
||||
//! Ok(()) => {},
|
||||
//! Err(missing) => {
|
||||
//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
|
||||
//! }
|
||||
//! if let Err(missing) = check_fields_present!([&extractor]) {
|
||||
//! // if you copypaste this to a custom assert method, remember to add #[track_caller]
|
||||
//! // to get the "user" code location for the panic.
|
||||
//! panic!("Missing fields: {missing:?}");
|
||||
//! }
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
||||
//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
|
||||
//!
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::{self},
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ExtractionResult {
|
||||
Present,
|
||||
Absent,
|
||||
@@ -71,51 +73,105 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
|
||||
}
|
||||
}
|
||||
|
||||
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
||||
|
||||
impl<'a> MemoryIdentity<'a> {
|
||||
fn as_ptr(&self) -> *const () {
|
||||
self.0 as *const _ as *const ()
|
||||
}
|
||||
}
|
||||
impl<'a> PartialEq for MemoryIdentity<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_ptr() == other.as_ptr()
|
||||
}
|
||||
}
|
||||
impl<'a> Eq for MemoryIdentity<'a> {}
|
||||
impl<'a> Hash for MemoryIdentity<'a> {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.as_ptr().hash(state);
|
||||
}
|
||||
}
|
||||
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
||||
}
|
||||
}
|
||||
|
||||
/// The extractor names passed as keys to [`new`].
|
||||
pub fn check_fields_present<const L: usize>(
|
||||
/// Checks that the given extractors are satisfied with the current span hierarchy.
|
||||
///
|
||||
/// This should not be called directly, but used through [`check_fields_present`] which allows
|
||||
/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
|
||||
#[doc(hidden)]
|
||||
pub fn check_fields_present0<const L: usize>(
|
||||
must_be_present: [&dyn Extractor; L],
|
||||
) -> Result<(), Vec<&dyn Extractor>> {
|
||||
let mut missing: HashSet<MemoryIdentity> =
|
||||
HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
|
||||
) -> Result<Summary, Vec<&dyn Extractor>> {
|
||||
let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
|
||||
let trace = tracing_error::SpanTrace::capture();
|
||||
trace.with_spans(|md, _formatted_fields| {
|
||||
missing.retain(|extractor| match extractor.0.extract(md.fields()) {
|
||||
// when trying to understand the inner workings of how does the matching work, note that
|
||||
// this closure might be called zero times if the span is disabled. normally it is called
|
||||
// once per span hierarchy level.
|
||||
missing.retain(|extractor| match extractor.extract(md.fields()) {
|
||||
ExtractionResult::Present => false,
|
||||
ExtractionResult::Absent => true,
|
||||
});
|
||||
!missing.is_empty() // continue walking up until we've found all missing
|
||||
|
||||
// continue walking up until we've found all missing
|
||||
!missing.is_empty()
|
||||
});
|
||||
if missing.is_empty() {
|
||||
Ok(())
|
||||
Ok(Summary::FoundEverything)
|
||||
} else if !tracing_subscriber_configured() {
|
||||
Ok(Summary::Unconfigured)
|
||||
} else {
|
||||
Err(missing.into_iter().map(|mi| mi.0).collect())
|
||||
// we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
|
||||
// missing, which can be annoying. for this case, we could probably use
|
||||
// SpanTrace::status().
|
||||
//
|
||||
// another way to end up here is with RUST_LOG=pageserver=off while configuring the
|
||||
// logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
|
||||
// this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
|
||||
Err(missing)
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks that the given extractors are satisfied with the current span hierarchy.
|
||||
///
|
||||
/// The macro is the preferred way of checking if fields exist while passing checks if a test does
|
||||
/// not have tracing configured.
|
||||
///
|
||||
/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
|
||||
/// However we can game a module namespaced macro for `use` purposes by re-exporting the
|
||||
/// #[macro_export] exported name with an alias (below).
|
||||
#[doc(hidden)]
|
||||
#[macro_export]
|
||||
macro_rules! __check_fields_present {
|
||||
($extractors:expr) => {{
|
||||
{
|
||||
use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
|
||||
|
||||
match check_fields_present0($extractors) {
|
||||
Ok(FoundEverything) => Ok(()),
|
||||
Ok(Unconfigured) if cfg!(test) => {
|
||||
// allow unconfigured in tests
|
||||
Ok(())
|
||||
},
|
||||
Ok(Unconfigured) => {
|
||||
panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
|
||||
},
|
||||
Err(missing) => Err(missing)
|
||||
}
|
||||
}
|
||||
}}
|
||||
}
|
||||
|
||||
pub use crate::__check_fields_present as check_fields_present;
|
||||
|
||||
/// Explanation for why the check was deemed ok.
|
||||
///
|
||||
/// Mainly useful for testing, or configuring per-crate behaviour as in with
|
||||
/// [`check_fields_present`].
|
||||
#[derive(Debug)]
|
||||
pub enum Summary {
|
||||
/// All extractors were found.
|
||||
///
|
||||
/// Should only happen when tracing is properly configured.
|
||||
FoundEverything,
|
||||
|
||||
/// Tracing has not been configured at all. This is ok for tests running without tracing set
|
||||
/// up.
|
||||
Unconfigured,
|
||||
}
|
||||
|
||||
fn tracing_subscriber_configured() -> bool {
|
||||
let mut noop_configured = false;
|
||||
tracing::dispatcher::get_default(|d| {
|
||||
// it is possible that this closure will not be invoked, but the current implementation
|
||||
// always invokes it
|
||||
noop_configured = d
|
||||
.downcast_ref::<tracing::subscriber::NoSubscriber>()
|
||||
.is_some();
|
||||
});
|
||||
|
||||
!noop_configured
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -123,6 +179,36 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::{self},
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
||||
|
||||
impl<'a> MemoryIdentity<'a> {
|
||||
fn as_ptr(&self) -> *const () {
|
||||
self.0 as *const _ as *const ()
|
||||
}
|
||||
}
|
||||
impl<'a> PartialEq for MemoryIdentity<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_ptr() == other.as_ptr()
|
||||
}
|
||||
}
|
||||
impl<'a> Eq for MemoryIdentity<'a> {}
|
||||
impl<'a> Hash for MemoryIdentity<'a> {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.as_ptr().hash(state);
|
||||
}
|
||||
}
|
||||
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
||||
}
|
||||
}
|
||||
|
||||
struct Setup {
|
||||
_current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
|
||||
tenant_extractor: MultiNameExtractor<2>,
|
||||
@@ -159,7 +245,8 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
||||
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -167,8 +254,8 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let missing =
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
|
||||
.unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -185,7 +272,8 @@ mod tests {
|
||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
||||
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -198,7 +286,7 @@ mod tests {
|
||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -207,7 +295,8 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
||||
let res = check_fields_present0([&setup.tenant_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -223,7 +312,8 @@ mod tests {
|
||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
||||
let res = check_fields_present0([&setup.tenant_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -231,7 +321,7 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -245,43 +335,107 @@ mod tests {
|
||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tracing_error_subscriber_not_set_up() {
|
||||
fn tracing_error_subscriber_not_set_up_straight_line() {
|
||||
// no setup
|
||||
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&extractor]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
|
||||
// similarly for a not found key
|
||||
let extractor = MultiNameExtractor::new("F", ["foobar"]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn panics_if_tracing_error_subscriber_has_wrong_filter() {
|
||||
fn tracing_error_subscriber_not_set_up_with_instrument() {
|
||||
// no setup
|
||||
|
||||
// demo a case where span entering is used to establish a parent child connection, but
|
||||
// when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let subspan = tracing::info_span!("bar", f = "foobar");
|
||||
drop(_guard);
|
||||
|
||||
// normally this would work, but without any tracing-subscriber configured, both
|
||||
// check_field_present find nothing
|
||||
let _guard = subspan.enter();
|
||||
let extractors: [&dyn Extractor; 2] = [
|
||||
&MultiNameExtractor::new("E", ["e"]),
|
||||
&MultiNameExtractor::new("F", ["f"]),
|
||||
];
|
||||
|
||||
let res = check_fields_present0(extractors);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
|
||||
// similarly for a not found key
|
||||
let extractor = MultiNameExtractor::new("G", ["g"]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tracing_subscriber_configured() {
|
||||
// this will fail if any utils::logging::init callers appear, but let's hope they do not
|
||||
// appear.
|
||||
assert!(!super::tracing_subscriber_configured());
|
||||
|
||||
let _g = setup_current_thread();
|
||||
|
||||
assert!(super::tracing_subscriber_configured());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn not_found_when_disabled_by_filter() {
|
||||
let r = tracing_subscriber::registry().with({
|
||||
tracing_error::ErrorLayer::default().with_filter(
|
||||
tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
|
||||
if md.is_span() && *md.level() == tracing::Level::INFO {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}),
|
||||
)
|
||||
tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
|
||||
|md| !(md.is_span() && *md.level() == tracing::Level::INFO),
|
||||
))
|
||||
});
|
||||
|
||||
let _guard = tracing::subscriber::set_default(r);
|
||||
|
||||
// this test is a rather tricky one, it has a number of possible outcomes depending on the
|
||||
// execution order when executed with other tests even if no test sets the global default
|
||||
// subscriber.
|
||||
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&extractor]);
|
||||
let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
|
||||
|
||||
if span.is_disabled() {
|
||||
// the tests are running single threaded, or we got lucky and no other tests subscriber
|
||||
// was got to register their per-CALLSITE::META interest between `set_default` and
|
||||
// creation of the span, thus the filter got to apply and registered interest of Never,
|
||||
// so the span was never created.
|
||||
//
|
||||
// as the span is disabled, no keys were recorded to it, leading check_fields_present0
|
||||
// to find an error.
|
||||
|
||||
let missing = check_fields_present0(extractors).unwrap_err();
|
||||
assert_missing(missing, vec![extractors[0]]);
|
||||
} else {
|
||||
// when the span is enabled, it is because some other test is running at the same time,
|
||||
// and that tests registry has filters which are interested in our above span.
|
||||
//
|
||||
// because the span is now enabled, all keys will be found for it. the
|
||||
// tracing_error::SpanTrace does not consider layer filters during the span hierarchy
|
||||
// walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
|
||||
// this test-induced issue.
|
||||
|
||||
let res = check_fields_present0(extractors);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
@@ -24,6 +25,7 @@ consumption_metrics.workspace = true
|
||||
crc32c.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
either.workspace = true
|
||||
flate2.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
|
||||
@@ -1,22 +1,23 @@
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
|
||||
use pageserver::tenant::storage_layer::LayerFileName;
|
||||
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use std::cmp::{max, min};
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
||||
let mut layer_map = LayerMap::<LayerDescriptor>::default();
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
|
||||
let mut min_lsn = Lsn(u64::MAX);
|
||||
let mut max_lsn = Lsn(0);
|
||||
@@ -27,13 +28,13 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
||||
for fname in filenames {
|
||||
let fname = fname.unwrap();
|
||||
let fname = LayerFileName::from_str(&fname).unwrap();
|
||||
let layer = LayerDescriptor::from(fname);
|
||||
let layer = PersistentLayerDesc::from(fname);
|
||||
|
||||
let lsn_range = layer.get_lsn_range();
|
||||
min_lsn = min(min_lsn, lsn_range.start);
|
||||
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
|
||||
updates.insert_historic(layer);
|
||||
}
|
||||
|
||||
println!("min: {min_lsn}, max: {max_lsn}");
|
||||
@@ -43,7 +44,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
||||
}
|
||||
|
||||
/// Construct a layer map query pattern for benchmarks
|
||||
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
|
||||
fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
|
||||
// For each image layer we query one of the pages contained, at LSN right
|
||||
// before the image layer was created. This gives us a somewhat uniform
|
||||
// coverage of both the lsn and key space because image layers have
|
||||
@@ -69,7 +70,7 @@ fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn
|
||||
|
||||
// Construct a partitioning for testing get_difficulty map when we
|
||||
// don't have an exact result of `collect_keyspace` to work with.
|
||||
fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
|
||||
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
|
||||
let mut parts = Vec::new();
|
||||
|
||||
// We add a partition boundary at the start of each image layer,
|
||||
@@ -209,13 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
for i in 0..100_000 {
|
||||
let i32 = (i as u32) % 100;
|
||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||
let layer = LayerDescriptor {
|
||||
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
lsn: Lsn(i)..Lsn(i + 1),
|
||||
is_incremental: false,
|
||||
short_id: format!("Layer {}", i),
|
||||
};
|
||||
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
|
||||
let layer = PersistentLayerDesc::new_img(
|
||||
TenantId::generate(),
|
||||
TimelineId::generate(),
|
||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
Lsn(i),
|
||||
false,
|
||||
0,
|
||||
);
|
||||
updates.insert_historic(layer);
|
||||
}
|
||||
updates.flush();
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
@@ -7,10 +7,10 @@
|
||||
//! - The y axis represents LSN, growing upwards.
|
||||
//!
|
||||
//! Coordinates in both axis are compressed for better readability.
|
||||
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
|
||||
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
|
||||
//!
|
||||
//! Example use:
|
||||
//! ```
|
||||
//! ```bash
|
||||
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||
//! $ firefox out.svg
|
||||
@@ -20,7 +20,7 @@
|
||||
//! or from pageserver log files.
|
||||
//!
|
||||
//! TODO Consider shipping this as a grafana panel plugin:
|
||||
//! https://grafana.com/tutorials/build-a-panel-plugin/
|
||||
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
|
||||
use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use std::cmp::Ordering;
|
||||
@@ -117,7 +117,8 @@ pub fn main() -> Result<()> {
|
||||
|
||||
let mut lsn_diff = (lsn_end - lsn_start) as f32;
|
||||
let mut fill = Fill::None;
|
||||
let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
||||
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
||||
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
|
||||
let mut lsn_offset = 0.0;
|
||||
|
||||
// Fill in and thicken rectangle if it's an
|
||||
@@ -128,7 +129,7 @@ pub fn main() -> Result<()> {
|
||||
num_images += 1;
|
||||
lsn_diff = 0.3;
|
||||
lsn_offset = -lsn_diff / 2.0;
|
||||
margin = 0.05;
|
||||
ymargin = 0.05;
|
||||
fill = Fill::Color(rgb(0, 0, 0));
|
||||
}
|
||||
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
||||
@@ -137,10 +138,10 @@ pub fn main() -> Result<()> {
|
||||
println!(
|
||||
" {}",
|
||||
rectangle(
|
||||
key_start as f32 + stretch * margin,
|
||||
stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
|
||||
key_diff as f32 - stretch * 2.0 * margin,
|
||||
stretch * (lsn_diff - 2.0 * margin)
|
||||
key_start as f32 + stretch * xmargin,
|
||||
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
|
||||
key_diff as f32 - stretch * 2.0 * xmargin,
|
||||
stretch * (lsn_diff - 2.0 * ymargin)
|
||||
)
|
||||
.fill(fill)
|
||||
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
|
||||
|
||||
@@ -171,11 +171,13 @@ pub struct PageServerConf {
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
||||
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
|
||||
/// See the comment in `eviction_task` for details.
|
||||
///
|
||||
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
@@ -570,21 +572,21 @@ impl PageServerConf {
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME)
|
||||
pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
|
||||
}
|
||||
|
||||
pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf {
|
||||
pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
|
||||
self.timelines_path(tenant_id).join(timeline_id.to_string())
|
||||
}
|
||||
|
||||
@@ -594,7 +596,7 @@ impl PageServerConf {
|
||||
timeline_id: TimelineId,
|
||||
) -> PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&timeline_id, &tenant_id),
|
||||
self.timeline_path(&tenant_id, &timeline_id),
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
@@ -617,8 +619,8 @@ impl PageServerConf {
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
|
||||
self.timeline_path(&timeline_id, &tenant_id)
|
||||
pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
|
||||
self.timeline_path(tenant_id, timeline_id)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
@@ -653,6 +655,7 @@ impl PageServerConf {
|
||||
match pg_version {
|
||||
14 => Ok(path.join(format!("v{pg_version}"))),
|
||||
15 => Ok(path.join(format!("v{pg_version}"))),
|
||||
16 => Ok(path.join(format!("v{pg_version}"))),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -661,6 +664,7 @@ impl PageServerConf {
|
||||
match pg_version {
|
||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
16 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -668,6 +672,7 @@ impl PageServerConf {
|
||||
match pg_version {
|
||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
16 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -993,6 +998,8 @@ impl ConfigurableSemaphore {
|
||||
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
|
||||
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
|
||||
/// behave like [`futures::future::pending`], just waiting until new permits are added.
|
||||
///
|
||||
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||
pub fn new(initial_permits: NonZeroUsize) -> Self {
|
||||
ConfigurableSemaphore {
|
||||
initial_permits,
|
||||
|
||||
@@ -24,6 +24,8 @@ const RESIDENT_SIZE: &str = "resident_size";
|
||||
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Debug)]
|
||||
struct Ids {
|
||||
@@ -73,7 +75,10 @@ pub async fn collect_metrics(
|
||||
);
|
||||
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
|
||||
|
||||
@@ -83,7 +88,7 @@ pub async fn collect_metrics(
|
||||
info!("collect_metrics received cancellation request");
|
||||
return Ok(());
|
||||
},
|
||||
_ = ticker.tick() => {
|
||||
tick_at = ticker.tick() => {
|
||||
|
||||
// send cached metrics every cached_metric_collection_interval
|
||||
let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
|
||||
@@ -93,6 +98,12 @@ pub async fn collect_metrics(
|
||||
}
|
||||
|
||||
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
|
||||
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
tick_at.elapsed(),
|
||||
metric_collection_interval,
|
||||
"consumption_metrics_collect_metrics",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -223,14 +234,18 @@ pub async fn collect_metrics_iteration(
|
||||
// Note that this metric is calculated in a separate bgworker
|
||||
// Here we only use cached value, which may lag behind the real latest one
|
||||
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: SYNTHETIC_STORAGE_SIZE,
|
||||
},
|
||||
tenant_synthetic_size,
|
||||
));
|
||||
|
||||
if tenant_synthetic_size != 0 {
|
||||
// only send non-zeroes because otherwise these show up as errors in logs
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: SYNTHETIC_STORAGE_SIZE,
|
||||
},
|
||||
tenant_synthetic_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Filter metrics, unless we want to send all metrics, including cached ones.
|
||||
@@ -273,31 +288,42 @@ pub async fn collect_metrics_iteration(
|
||||
})
|
||||
.expect("PageserverConsumptionMetric should not fail serialization");
|
||||
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk_json)
|
||||
.send()
|
||||
.await;
|
||||
const MAX_RETRIES: u32 = 3;
|
||||
|
||||
match res {
|
||||
Ok(res) => {
|
||||
if res.status().is_success() {
|
||||
// update cached metrics after they were sent successfully
|
||||
for (curr_key, curr_val) in chunk.iter() {
|
||||
cached_metrics.insert(curr_key.clone(), *curr_val);
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
for metric in chunk_to_send.iter() {
|
||||
// Report if the metric value is suspiciously large
|
||||
if metric.value > (1u64 << 40) {
|
||||
for attempt in 0..MAX_RETRIES {
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk_json)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(res) => {
|
||||
if res.status().is_success() {
|
||||
// update cached metrics after they were sent successfully
|
||||
for (curr_key, curr_val) in chunk.iter() {
|
||||
cached_metrics.insert(curr_key.clone(), *curr_val);
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
for metric in chunk_to_send
|
||||
.iter()
|
||||
.filter(|metric| metric.value > (1u64 << 40))
|
||||
{
|
||||
// Report if the metric value is suspiciously large
|
||||
error!("potentially abnormal metric value: {:?}", metric);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
Err(err) if err.is_timeout() => {
|
||||
error!(attempt, "timeout sending metrics, retrying immediately");
|
||||
continue;
|
||||
}
|
||||
Err(err) => {
|
||||
error!(attempt, ?err, "failed to send metrics");
|
||||
break;
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!("failed to send metrics: {:?}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -317,7 +343,7 @@ pub async fn calculate_synthetic_size_worker(
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
return Ok(());
|
||||
},
|
||||
_ = ticker.tick() => {
|
||||
tick_at = ticker.tick() => {
|
||||
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
@@ -343,6 +369,12 @@ pub async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
tick_at.elapsed(),
|
||||
synthetic_size_calculation_interval,
|
||||
"consumption_metrics_synthetic_size_worker",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,6 +179,9 @@ impl RequestContext {
|
||||
/// a context and you are unwilling to change all callers to provide one.
|
||||
///
|
||||
/// Before we add cancellation, we should get rid of this method.
|
||||
///
|
||||
/// [`attached_child`]: Self::attached_child
|
||||
/// [`detached_child`]: Self::detached_child
|
||||
pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
Self::new(task_kind, download_behavior)
|
||||
}
|
||||
|
||||
@@ -110,7 +110,6 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
|
||||
disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
|
||||
.await;
|
||||
info!("disk usage based eviction task finishing");
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
@@ -126,13 +125,16 @@ async fn disk_usage_eviction_task(
|
||||
tenants_dir: &Path,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
scopeguard::defer! {
|
||||
info!("disk usage based eviction task finishing");
|
||||
};
|
||||
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
{
|
||||
if random_init_delay(task_config.period, &cancel)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
info!("shutting down");
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -167,7 +169,6 @@ async fn disk_usage_eviction_task(
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep_until(sleep_until) => {},
|
||||
_ = cancel.cancelled() => {
|
||||
info!("shutting down");
|
||||
break
|
||||
}
|
||||
}
|
||||
@@ -304,7 +305,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
debug!(
|
||||
"cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
|
||||
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
candidate.layer.file_size(),
|
||||
@@ -314,7 +315,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
partition,
|
||||
candidate.layer.get_tenant_id(),
|
||||
candidate.layer.get_timeline_id(),
|
||||
candidate.layer.filename().file_name(),
|
||||
candidate.layer,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -186,10 +186,8 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
delete:
|
||||
description: "Attempts to delete specified timeline. On 500 errors should be retried"
|
||||
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
|
||||
responses:
|
||||
"200":
|
||||
description: Ok
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
@@ -214,6 +212,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Deletion is already in progress, continue polling
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"412":
|
||||
description: Tenant is missing, or timeline has children
|
||||
content:
|
||||
@@ -718,6 +722,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"406":
|
||||
description: Permanently unsatisfiable request, don't retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"409":
|
||||
description: Timeline already exists, creation skipped
|
||||
content:
|
||||
|
||||
@@ -23,7 +23,6 @@ use super::models::{
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::disk_usage_eviction_task;
|
||||
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
@@ -35,6 +34,7 @@ use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::{
|
||||
@@ -187,6 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
format!("Cannot delete timeline which has child timelines: {children:?}")
|
||||
.into_boxed_str(),
|
||||
),
|
||||
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
@@ -327,18 +328,25 @@ async fn timeline_create_handler(
|
||||
&ctx,
|
||||
)
|
||||
.await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
Ok(new_timeline) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
|
||||
Err(err) => Err(ApiError::InternalServerError(err)),
|
||||
Err(tenant::CreateTimelineError::AlreadyExists) => {
|
||||
json_response(StatusCode::CONFLICT, ())
|
||||
}
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
|
||||
json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
|
||||
format!("{err:#}")
|
||||
))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -373,7 +381,7 @@ async fn timeline_list_handler(
|
||||
}
|
||||
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
||||
}
|
||||
.instrument(info_span!("timeline_list", tenant = %tenant_id))
|
||||
.instrument(info_span!("timeline_list", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
@@ -410,7 +418,7 @@ async fn timeline_detail_handler(
|
||||
|
||||
Ok::<_, ApiError>(timeline_info)
|
||||
}
|
||||
.instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, timeline_info)
|
||||
@@ -471,7 +479,7 @@ async fn tenant_attach_handler(
|
||||
remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
||||
.instrument(info_span!("tenant_attach", %tenant_id))
|
||||
.await?;
|
||||
} else {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
@@ -493,7 +501,7 @@ async fn timeline_delete_handler(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
|
||||
.await?;
|
||||
|
||||
// FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
|
||||
@@ -511,7 +519,7 @@ async fn tenant_detach_handler(
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
|
||||
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
|
||||
.instrument(info_span!("tenant_detach", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -534,7 +542,7 @@ async fn tenant_load_handler(
|
||||
state.remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("load", tenant = %tenant_id))
|
||||
.instrument(info_span!("load", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
@@ -550,7 +558,7 @@ async fn tenant_ignore_handler(
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::ignore_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
|
||||
.instrument(info_span!("ignore_tenant", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -603,7 +611,7 @@ async fn tenant_status(
|
||||
attachment_status: state.attachment_status(),
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
|
||||
.instrument(info_span!("tenant_status_handler", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
@@ -842,7 +850,7 @@ async fn tenant_create_handler(
|
||||
state.remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
||||
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
|
||||
.await?;
|
||||
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
@@ -904,7 +912,7 @@ async fn update_tenant_config_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
.instrument(info_span!("tenant_config", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -1135,7 +1143,7 @@ async fn disk_usage_eviction_run(
|
||||
let Some(storage) = state.remote_storage.clone() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)))
|
||||
)));
|
||||
};
|
||||
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
|
||||
Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
|
||||
UIntGauge, UIntGaugeVec,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
|
||||
register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::TenantState;
|
||||
@@ -130,6 +130,122 @@ pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub struct PageCacheMetrics {
|
||||
pub read_accesses_materialized_page: IntCounter,
|
||||
pub read_accesses_ephemeral: IntCounter,
|
||||
pub read_accesses_immutable: IntCounter,
|
||||
|
||||
pub read_hits_ephemeral: IntCounter,
|
||||
pub read_hits_immutable: IntCounter,
|
||||
pub read_hits_materialized_page_exact: IntCounter,
|
||||
pub read_hits_materialized_page_older_lsn: IntCounter,
|
||||
}
|
||||
|
||||
static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_page_cache_read_hits_total",
|
||||
"Number of read accesses to the page cache that hit",
|
||||
&["key_kind", "hit_kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_page_cache_read_accesses_total",
|
||||
"Number of read accesses to the page cache",
|
||||
&["key_kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
|
||||
read_accesses_materialized_page: {
|
||||
PAGE_CACHE_READ_ACCESSES
|
||||
.get_metric_with_label_values(&["materialized_page"])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_accesses_ephemeral: {
|
||||
PAGE_CACHE_READ_ACCESSES
|
||||
.get_metric_with_label_values(&["ephemeral"])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_accesses_immutable: {
|
||||
PAGE_CACHE_READ_ACCESSES
|
||||
.get_metric_with_label_values(&["immutable"])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_hits_ephemeral: {
|
||||
PAGE_CACHE_READ_HITS
|
||||
.get_metric_with_label_values(&["ephemeral", "-"])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_hits_immutable: {
|
||||
PAGE_CACHE_READ_HITS
|
||||
.get_metric_with_label_values(&["immutable", "-"])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_hits_materialized_page_exact: {
|
||||
PAGE_CACHE_READ_HITS
|
||||
.get_metric_with_label_values(&["materialized_page", "exact"])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_hits_materialized_page_older_lsn: {
|
||||
PAGE_CACHE_READ_HITS
|
||||
.get_metric_with_label_values(&["materialized_page", "older_lsn"])
|
||||
.unwrap()
|
||||
},
|
||||
});
|
||||
|
||||
pub struct PageCacheSizeMetrics {
|
||||
pub max_bytes: UIntGauge,
|
||||
|
||||
pub current_bytes_ephemeral: UIntGauge,
|
||||
pub current_bytes_immutable: UIntGauge,
|
||||
pub current_bytes_materialized_page: UIntGauge,
|
||||
}
|
||||
|
||||
static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_page_cache_size_current_bytes",
|
||||
"Current size of the page cache in bytes, by key kind",
|
||||
&["key_kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
|
||||
max_bytes: {
|
||||
register_uint_gauge!(
|
||||
"pageserver_page_cache_size_max_bytes",
|
||||
"Maximum size of the page cache in bytes"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
},
|
||||
|
||||
current_bytes_ephemeral: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["ephemeral"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_immutable: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["immutable"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_materialized_page: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["materialized_page"])
|
||||
.unwrap()
|
||||
},
|
||||
});
|
||||
|
||||
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_wait_lsn_seconds",
|
||||
@@ -204,11 +320,11 @@ pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
|
||||
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_synthetic_size",
|
||||
"Synthetic size of each tenant",
|
||||
"pageserver_tenant_synthetic_cached_size_bytes",
|
||||
"Synthetic size of each tenant in bytes",
|
||||
&["tenant_id"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_synthetic_size metric")
|
||||
.expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
|
||||
});
|
||||
|
||||
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
|
||||
@@ -269,7 +385,7 @@ pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
data_source: &'static str,
|
||||
@@ -425,6 +541,17 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
// Go code uses milliseconds. Variable is called `computeStartupBuckets`
|
||||
[
|
||||
5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
|
||||
1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
|
||||
]
|
||||
.map(|ms| (ms as f64) / 1000.0)
|
||||
});
|
||||
|
||||
pub struct BasebackupQueryTime(HistogramVec);
|
||||
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
BasebackupQueryTime({
|
||||
@@ -432,7 +559,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
"pageserver_basebackup_query_seconds",
|
||||
"Histogram of basebackup queries durations, by result type",
|
||||
&["result"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
COMPUTE_STARTUP_BUCKETS.to_vec(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
})
|
||||
@@ -702,7 +829,7 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
start: Instant,
|
||||
@@ -760,7 +887,7 @@ impl StorageTimeMetrics {
|
||||
|
||||
/// Starts timing a new operation.
|
||||
///
|
||||
/// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
|
||||
/// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
|
||||
pub fn start_timer(&self) -> StorageTimeMetricsTimer {
|
||||
StorageTimeMetricsTimer::new(self.clone())
|
||||
}
|
||||
@@ -968,7 +1095,6 @@ impl RemoteTimelineClientMetrics {
|
||||
op_kind: &RemoteOpKind,
|
||||
status: &'static str,
|
||||
) -> Histogram {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.remote_operation_time.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str(), status);
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
@@ -990,7 +1116,6 @@ impl RemoteTimelineClientMetrics {
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntGauge {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.calls_unfinished_gauge.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
@@ -1011,7 +1136,6 @@ impl RemoteTimelineClientMetrics {
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Histogram {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.calls_started_hist.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
@@ -1032,7 +1156,6 @@ impl RemoteTimelineClientMetrics {
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntCounter {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.bytes_started_counter.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
@@ -1053,7 +1176,6 @@ impl RemoteTimelineClientMetrics {
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntCounter {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.bytes_finished_counter.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
@@ -1145,7 +1267,7 @@ impl RemoteTimelineClientMetrics {
|
||||
/// Update the metrics that change when a call to the remote timeline client instance starts.
|
||||
///
|
||||
/// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
|
||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
|
||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
|
||||
/// is more suitable.
|
||||
/// Never do both.
|
||||
pub(crate) fn call_begin(
|
||||
@@ -1178,7 +1300,7 @@ impl RemoteTimelineClientMetrics {
|
||||
|
||||
/// Manually udpate the metrics that track completions, instead of using the guard object.
|
||||
/// Using the guard object is generally preferable.
|
||||
/// See [`call_begin`] for more context.
|
||||
/// See [`call_begin`](Self::call_begin) for more context.
|
||||
pub(crate) fn call_end(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
|
||||
@@ -53,8 +53,8 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::writeback_ephemeral_file;
|
||||
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 50;
|
||||
@@ -187,6 +187,8 @@ pub struct PageCache {
|
||||
/// Index of the next candidate to evict, for the Clock replacement algorithm.
|
||||
/// This is interpreted modulo the page cache size.
|
||||
next_evict_slot: AtomicUsize,
|
||||
|
||||
size_metrics: &'static PageCacheSizeMetrics,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -313,6 +315,10 @@ impl PageCache {
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
) -> Option<(Lsn, PageReadGuard)> {
|
||||
crate::metrics::PAGE_CACHE
|
||||
.read_accesses_materialized_page
|
||||
.inc();
|
||||
|
||||
let mut cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
@@ -323,8 +329,21 @@ impl PageCache {
|
||||
};
|
||||
|
||||
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
|
||||
if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
|
||||
Some((lsn, guard))
|
||||
if let CacheKey::MaterializedPage {
|
||||
hash_key: _,
|
||||
lsn: available_lsn,
|
||||
} = cache_key
|
||||
{
|
||||
if available_lsn == lsn {
|
||||
crate::metrics::PAGE_CACHE
|
||||
.read_hits_materialized_page_exact
|
||||
.inc();
|
||||
} else {
|
||||
crate::metrics::PAGE_CACHE
|
||||
.read_hits_materialized_page_older_lsn
|
||||
.inc();
|
||||
}
|
||||
Some((available_lsn, guard))
|
||||
} else {
|
||||
panic!("unexpected key type in slot");
|
||||
}
|
||||
@@ -499,11 +518,31 @@ impl PageCache {
|
||||
/// ```
|
||||
///
|
||||
fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
|
||||
let (read_access, hit) = match cache_key {
|
||||
CacheKey::MaterializedPage { .. } => {
|
||||
unreachable!("Materialized pages use lookup_materialized_page")
|
||||
}
|
||||
CacheKey::EphemeralPage { .. } => (
|
||||
&crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
|
||||
&crate::metrics::PAGE_CACHE.read_hits_ephemeral,
|
||||
),
|
||||
CacheKey::ImmutableFilePage { .. } => (
|
||||
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
|
||||
&crate::metrics::PAGE_CACHE.read_hits_immutable,
|
||||
),
|
||||
};
|
||||
read_access.inc();
|
||||
|
||||
let mut is_first_iteration = true;
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
|
||||
if is_first_iteration {
|
||||
hit.inc();
|
||||
}
|
||||
return Ok(ReadBufResult::Found(read_guard));
|
||||
}
|
||||
is_first_iteration = false;
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) =
|
||||
@@ -681,6 +720,9 @@ impl PageCache {
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
|
||||
versions.remove(version_idx);
|
||||
self.size_metrics
|
||||
.current_bytes_materialized_page
|
||||
.sub_page_sz(1);
|
||||
if versions.is_empty() {
|
||||
old_entry.remove_entry();
|
||||
}
|
||||
@@ -693,11 +735,13 @@ impl PageCache {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
self.size_metrics.current_bytes_immutable.sub_page_sz(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -725,6 +769,9 @@ impl PageCache {
|
||||
slot_idx,
|
||||
},
|
||||
);
|
||||
self.size_metrics
|
||||
.current_bytes_materialized_page
|
||||
.add_page_sz(1);
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -735,6 +782,7 @@ impl PageCache {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(slot_idx);
|
||||
self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -745,6 +793,7 @@ impl PageCache {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(slot_idx);
|
||||
self.size_metrics.current_bytes_immutable.add_page_sz(1);
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -844,6 +893,12 @@ impl PageCache {
|
||||
|
||||
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
|
||||
|
||||
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
|
||||
size_metrics.max_bytes.set_page_sz(num_pages);
|
||||
size_metrics.current_bytes_ephemeral.set_page_sz(0);
|
||||
size_metrics.current_bytes_immutable.set_page_sz(0);
|
||||
size_metrics.current_bytes_materialized_page.set_page_sz(0);
|
||||
|
||||
let slots = page_buffer
|
||||
.chunks_exact_mut(PAGE_SZ)
|
||||
.map(|chunk| {
|
||||
@@ -866,6 +921,30 @@ impl PageCache {
|
||||
immutable_page_map: Default::default(),
|
||||
slots,
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
size_metrics,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
trait PageSzBytesMetric {
|
||||
fn set_page_sz(&self, count: usize);
|
||||
fn add_page_sz(&self, count: usize);
|
||||
fn sub_page_sz(&self, count: usize);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn count_times_page_sz(count: usize) -> u64 {
|
||||
u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
|
||||
}
|
||||
|
||||
impl PageSzBytesMetric for metrics::UIntGauge {
|
||||
fn set_page_sz(&self, count: usize) {
|
||||
self.set(count_times_page_sz(count));
|
||||
}
|
||||
fn add_page_sz(&self, count: usize) {
|
||||
self.add(count_times_page_sz(count));
|
||||
}
|
||||
fn sub_page_sz(&self, count: usize) {
|
||||
self.sub(count_times_page_sz(count));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
//
|
||||
|
||||
use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use bytes::Bytes;
|
||||
use futures::Stream;
|
||||
@@ -31,8 +32,10 @@ use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tracing::field;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::{
|
||||
@@ -51,6 +54,7 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::GetTenantError;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
@@ -238,6 +242,7 @@ pub async fn libpq_listener_main(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(peer_addr))]
|
||||
async fn page_service_conn_main(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
@@ -260,6 +265,7 @@ async fn page_service_conn_main(
|
||||
.context("could not set TCP_NODELAY")?;
|
||||
|
||||
let peer_addr = socket.peer_addr().context("get peer address")?;
|
||||
tracing::Span::current().record("peer_addr", field::display(peer_addr));
|
||||
|
||||
// setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
|
||||
// - long enough for most valid compute connections
|
||||
@@ -362,7 +368,7 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
#[instrument(skip_all)]
|
||||
async fn handle_pagerequests<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -373,6 +379,8 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// NOTE: pagerequests handler exits when connection is closed,
|
||||
// so there is no need to reset the association
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
@@ -473,7 +481,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
#[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
|
||||
async fn handle_import_basebackup<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -487,6 +495,8 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
@@ -531,7 +541,7 @@ impl PageServerHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
#[instrument(skip_all, fields(%start_lsn, %end_lsn))]
|
||||
async fn handle_import_wal<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -544,6 +554,7 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
@@ -738,7 +749,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
|
||||
async fn handle_basebackup_request<IO>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -747,11 +758,14 @@ impl PageServerHandler {
|
||||
lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
// check that the timeline exists
|
||||
@@ -772,8 +786,9 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
// Send a tarball of the latest layer on the timeline
|
||||
{
|
||||
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
||||
if full_backup {
|
||||
let mut writer = pgb.copyout_writer();
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut writer,
|
||||
@@ -784,6 +799,40 @@ impl PageServerHandler {
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
let mut writer = pgb.copyout_writer();
|
||||
if gzip {
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
writer,
|
||||
// NOTE using fast compression because it's on the critical path
|
||||
// for compute startup. For an empty database, we get
|
||||
// <100KB with this method. The Level::Best compression method
|
||||
// gives us <20KB, but maybe we should add basebackup caching
|
||||
// on compute shutdown first.
|
||||
async_compression::Level::Fastest,
|
||||
);
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut encoder,
|
||||
&timeline,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
encoder.shutdown().await?;
|
||||
} else {
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut writer,
|
||||
&timeline,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||
@@ -862,6 +911,7 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id, timeline_id))]
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -883,6 +933,10 @@ where
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
|
||||
@@ -902,9 +956,13 @@ where
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let lsn = if params.len() == 3 {
|
||||
let lsn = if params.len() >= 3 {
|
||||
Some(
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
@@ -913,6 +971,19 @@ where
|
||||
None
|
||||
};
|
||||
|
||||
let gzip = if params.len() >= 4 {
|
||||
if params[3] == "--gzip" {
|
||||
true
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Parameter in position 3 unknown {}",
|
||||
params[3],
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
@@ -923,6 +994,7 @@ where
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -948,6 +1020,10 @@ where
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
|
||||
@@ -979,6 +1055,10 @@ where
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if params.len() > 2 {
|
||||
Some(
|
||||
@@ -1000,8 +1080,17 @@ where
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
|
||||
.await?;
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("import basebackup ") {
|
||||
// Import the `base` section (everything but the wal) of a basebackup.
|
||||
@@ -1033,6 +1122,10 @@ where
|
||||
let pg_version = u32::from_str(params[4])
|
||||
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
match self
|
||||
@@ -1077,6 +1170,10 @@ where
|
||||
let end_lsn = Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
match self
|
||||
@@ -1108,6 +1205,8 @@ where
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
|
||||
tracing::Span::current().record("tenant_id", field::display(tenant_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
|
||||
@@ -887,7 +887,7 @@ impl<'a> DatadirModification<'a> {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), RelationError> {
|
||||
if rel.relnode == 0 {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
return Err(RelationError::InvalidRelnode);
|
||||
}
|
||||
// It's possible that this is the first rel for this db in this
|
||||
// tablespace. Create the reldir entry for it if so.
|
||||
@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// context, breaking the atomicity is OK. If the import is interrupted, the
|
||||
/// whole import fails and the timeline will be deleted anyway.
|
||||
/// (Or to be precise, it will be left behind for debugging purposes and
|
||||
/// ignored, see https://github.com/neondatabase/neon/pull/1809)
|
||||
/// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
|
||||
///
|
||||
/// Note: A consequence of flushing the pending operations is that they
|
||||
/// won't be visible to subsequent operations until `commit`. The function
|
||||
|
||||
@@ -205,7 +205,7 @@ pub enum TaskKind {
|
||||
///
|
||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||
/// That abstraction doesn't use `task_mgr`.
|
||||
/// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
|
||||
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
|
||||
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
||||
///
|
||||
/// Once the connection is established, the `TaskHandle` task creates a
|
||||
@@ -213,16 +213,21 @@ pub enum TaskKind {
|
||||
/// the `Connection` object.
|
||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
||||
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
|
||||
///
|
||||
/// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler
|
||||
/// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller
|
||||
WalReceiverManager,
|
||||
|
||||
/// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
|
||||
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
|
||||
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
///
|
||||
/// [`WalReceiverManager`]: Self::WalReceiverManager
|
||||
WalReceiverConnectionHandler,
|
||||
|
||||
/// The task that polls the `tokio-postgres::Connection` object.
|
||||
/// Spawned by task [`WalReceiverConnectionHandler`].
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
/// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
|
||||
/// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
|
||||
WalReceiverConnectionPoller,
|
||||
|
||||
// Garbage collection worker. One per tenant
|
||||
@@ -506,17 +511,17 @@ pub async fn shutdown_tasks(
|
||||
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
}
|
||||
let completed = tokio::select! {
|
||||
let join_handle = tokio::select! {
|
||||
biased;
|
||||
_ = &mut join_handle => { true },
|
||||
_ = &mut join_handle => { None },
|
||||
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
false
|
||||
Some(join_handle)
|
||||
}
|
||||
};
|
||||
if !completed {
|
||||
if let Some(join_handle) = join_handle {
|
||||
// we never handled this return value, but:
|
||||
// - we don't deschedule which would lead to is_cancelled
|
||||
// - panics are already logged (is_panicked)
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
//! parent timeline, and the last LSN that has been written to disk.
|
||||
//!
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::FutureExt;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use remote_storage::DownloadError;
|
||||
@@ -49,6 +49,8 @@ use std::time::{Duration, Instant};
|
||||
use self::config::TenantConf;
|
||||
use self::metadata::TimelineMetadata;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -68,6 +70,7 @@ use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::InitializationOrder;
|
||||
|
||||
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::walredo::WalRedoManager;
|
||||
@@ -81,12 +84,32 @@ use utils::{
|
||||
lsn::{Lsn, RecordLsn},
|
||||
};
|
||||
|
||||
/// Declare a failpoint that can use the `pause` failpoint action.
|
||||
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
macro_rules! pausable_failpoint {
|
||||
($name:literal) => {
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint {}", $name);
|
||||
fail::fail_point!($name);
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
pub mod blob_io;
|
||||
pub mod block_io;
|
||||
pub mod disk_btree;
|
||||
pub(crate) mod ephemeral_file;
|
||||
pub mod layer_map;
|
||||
pub mod manifest;
|
||||
mod span;
|
||||
|
||||
pub mod metadata;
|
||||
mod par_fsync;
|
||||
@@ -102,7 +125,7 @@ mod timeline;
|
||||
|
||||
pub mod size;
|
||||
|
||||
pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
pub use timeline::{
|
||||
LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
|
||||
};
|
||||
@@ -110,7 +133,7 @@ pub use timeline::{
|
||||
// re-export this function so that page_cache.rs can use it.
|
||||
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
|
||||
|
||||
// re-export for use in storage_sync.rs
|
||||
// re-export for use in remote_timeline_client.rs
|
||||
pub use crate::tenant::metadata::save_metadata;
|
||||
|
||||
// re-export for use in walreceiver
|
||||
@@ -161,200 +184,6 @@ pub struct Tenant {
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
}
|
||||
|
||||
/// A timeline with some of its files on disk, being initialized.
|
||||
/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
|
||||
/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
|
||||
/// to be removed on next restart.
|
||||
///
|
||||
/// The caller is responsible for proper timeline data filling before the final init.
|
||||
#[must_use]
|
||||
pub struct UninitializedTimeline<'t> {
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
|
||||
}
|
||||
|
||||
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
|
||||
/// or gets removed eventually.
|
||||
///
|
||||
/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
|
||||
#[must_use]
|
||||
struct TimelineUninitMark {
|
||||
uninit_mark_deleted: bool,
|
||||
uninit_mark_path: PathBuf,
|
||||
timeline_path: PathBuf,
|
||||
}
|
||||
|
||||
impl UninitializedTimeline<'_> {
|
||||
/// Finish timeline creation: insert it into the Tenant's timelines map and remove the
|
||||
/// uninit mark file.
|
||||
///
|
||||
/// This function launches the flush loop if not already done.
|
||||
///
|
||||
/// The caller is responsible for activating the timeline (function `.activate()`).
|
||||
fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timeline_id = self.timeline_id;
|
||||
let tenant_id = self.owning_tenant.tenant_id;
|
||||
|
||||
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
|
||||
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
// Check that the caller initialized disk_consistent_lsn
|
||||
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
|
||||
ensure!(
|
||||
new_disk_consistent_lsn.is_valid(),
|
||||
"new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
|
||||
);
|
||||
|
||||
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
|
||||
match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(_) => anyhow::bail!(
|
||||
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
|
||||
),
|
||||
Entry::Vacant(v) => {
|
||||
uninit_mark.remove_uninit_mark().with_context(|| {
|
||||
format!(
|
||||
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
|
||||
)
|
||||
})?;
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
|
||||
new_timeline.maybe_spawn_flush_loop();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(new_timeline)
|
||||
}
|
||||
|
||||
/// Prepares timeline data by loading it from the basebackup archive.
|
||||
pub async fn import_basebackup_from_tar(
|
||||
self,
|
||||
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
|
||||
base_lsn: Lsn,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let raw_timeline = self.raw_timeline()?;
|
||||
|
||||
import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
|
||||
.await
|
||||
.context("Failed to import basebackup")?;
|
||||
|
||||
// Flush the new layer files to disk, before we make the timeline as available to
|
||||
// the outside world.
|
||||
//
|
||||
// Flush loop needs to be spawned in order to be able to flush.
|
||||
raw_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
raw_timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.context("Failed to flush after basebackup import")?;
|
||||
|
||||
// All the data has been imported. Insert the Timeline into the tenant's timelines
|
||||
// map and remove the uninit mark file.
|
||||
let tl = self.finish_creation()?;
|
||||
tl.activate(broker_client, None, ctx);
|
||||
Ok(tl)
|
||||
}
|
||||
|
||||
fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
|
||||
Ok(&self
|
||||
.raw_timeline
|
||||
.as_ref()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"No raw timeline {}/{} found",
|
||||
self.owning_tenant.tenant_id, self.timeline_id
|
||||
)
|
||||
})?
|
||||
.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for UninitializedTimeline<'_> {
|
||||
fn drop(&mut self) {
|
||||
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
|
||||
let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
|
||||
error!("Timeline got dropped without initializing, cleaning its files");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
|
||||
let timeline_path = &uninit_mark.timeline_path;
|
||||
match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
|
||||
Ok(()) => {
|
||||
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
|
||||
}
|
||||
}
|
||||
drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
|
||||
}
|
||||
|
||||
impl TimelineUninitMark {
|
||||
fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
|
||||
Self {
|
||||
uninit_mark_deleted: false,
|
||||
uninit_mark_path,
|
||||
timeline_path,
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
|
||||
if !self.uninit_mark_deleted {
|
||||
self.delete_mark_file_if_present()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
|
||||
let uninit_mark_file = &self.uninit_mark_path;
|
||||
let uninit_mark_parent = uninit_mark_file
|
||||
.parent()
|
||||
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
|
||||
ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
|
||||
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
|
||||
})?;
|
||||
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
|
||||
self.uninit_mark_deleted = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineUninitMark {
|
||||
fn drop(&mut self) {
|
||||
if !self.uninit_mark_deleted {
|
||||
if self.timeline_path.exists() {
|
||||
error!(
|
||||
"Uninit mark {} is not removed, timeline {} stays uninitialized",
|
||||
self.uninit_mark_path.display(),
|
||||
self.timeline_path.display()
|
||||
)
|
||||
} else {
|
||||
// unblock later timeline creation attempts
|
||||
warn!(
|
||||
"Removing intermediate uninit mark file {}",
|
||||
self.uninit_mark_path.display()
|
||||
);
|
||||
if let Err(e) = self.delete_mark_file_if_present() {
|
||||
error!("Failed to remove the uninit mark file: {e}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We should not blindly overwrite local metadata with remote one.
|
||||
// For example, consider the following case:
|
||||
// Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
|
||||
@@ -440,8 +269,13 @@ pub enum GetTimelineError {
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("NotFound")]
|
||||
NotFound,
|
||||
|
||||
#[error("HasChildren")]
|
||||
HasChildren(Vec<TimelineId>),
|
||||
|
||||
#[error("Timeline deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
@@ -496,6 +330,16 @@ impl DeletionGuard {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum CreateTimelineError {
|
||||
#[error("a timeline with the given ID already exists")]
|
||||
AlreadyExists,
|
||||
#[error(transparent)]
|
||||
AncestorLsn(anyhow::Error),
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl Tenant {
|
||||
/// Yet another helper for timeline initialization.
|
||||
/// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
|
||||
@@ -585,6 +429,7 @@ impl Tenant {
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.iter_historic_layers()
|
||||
.next()
|
||||
.is_some(),
|
||||
@@ -595,8 +440,8 @@ impl Tenant {
|
||||
if !picked_local {
|
||||
save_metadata(
|
||||
self.conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
up_to_date_metadata,
|
||||
first_save,
|
||||
)
|
||||
@@ -625,7 +470,7 @@ impl Tenant {
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
// TODO dedup with spawn_load
|
||||
let tenant_conf =
|
||||
Self::load_tenant_config(conf, tenant_id).context("load tenant config")?;
|
||||
Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
|
||||
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
@@ -679,7 +524,7 @@ impl Tenant {
|
||||
/// No background tasks are started as part of this routine.
|
||||
///
|
||||
async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
|
||||
if !tokio::fs::try_exists(&marker_file)
|
||||
@@ -734,7 +579,7 @@ impl Tenant {
|
||||
.map(move |res| {
|
||||
res.with_context(|| format!("download index part for timeline {timeline_id}"))
|
||||
})
|
||||
.instrument(info_span!("download_index_part", timeline=%timeline_id)),
|
||||
.instrument(info_span!("download_index_part", %timeline_id)),
|
||||
);
|
||||
}
|
||||
// Wait for all the download tasks to complete & collect results.
|
||||
@@ -817,10 +662,10 @@ impl Tenant {
|
||||
remote_client: RemoteTimelineClient,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
info!("downloading index file for timeline {}", timeline_id);
|
||||
tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
|
||||
tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_id, &timeline_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")?;
|
||||
|
||||
@@ -896,9 +741,9 @@ impl Tenant {
|
||||
init_order: Option<InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> Arc<Tenant> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
|
||||
let tenant_conf = match Self::load_tenant_config(conf, &tenant_id) {
|
||||
Ok(conf) => conf,
|
||||
Err(e) => {
|
||||
error!("load tenant config failed: {:?}", e);
|
||||
@@ -1009,7 +854,7 @@ impl Tenant {
|
||||
timeline_uninit_mark_file.display()
|
||||
)
|
||||
})?;
|
||||
let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
||||
let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id);
|
||||
if let Err(e) =
|
||||
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
|
||||
{
|
||||
@@ -1054,7 +899,7 @@ impl Tenant {
|
||||
if let Ok(timeline_id) =
|
||||
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
|
||||
{
|
||||
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
|
||||
let metadata = load_metadata(self.conf, &self.tenant_id, &timeline_id)
|
||||
.context("failed to load metadata")?;
|
||||
timelines_to_load.insert(timeline_id, metadata);
|
||||
} else {
|
||||
@@ -1082,7 +927,7 @@ impl Tenant {
|
||||
init_order: Option<&InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
debug!("loading tenant task");
|
||||
|
||||
@@ -1128,7 +973,7 @@ impl Tenant {
|
||||
init_order: Option<&InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
|
||||
RemoteTimelineClient::new(
|
||||
@@ -1369,8 +1214,7 @@ impl Tenant {
|
||||
/// Returns the new timeline ID and reference to its Timeline object.
|
||||
///
|
||||
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
|
||||
/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given,
|
||||
/// a new unique ID is generated.
|
||||
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
|
||||
pub async fn create_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
@@ -1379,11 +1223,12 @@ impl Tenant {
|
||||
pg_version: u32,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<Arc<Timeline>>> {
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot create timelines on inactive tenant"
|
||||
);
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
if !self.is_active() {
|
||||
return Err(CreateTimelineError::Other(anyhow::anyhow!(
|
||||
"Cannot create timelines on inactive tenant"
|
||||
)));
|
||||
}
|
||||
|
||||
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
|
||||
debug!("timeline {new_timeline_id} already exists");
|
||||
@@ -1403,7 +1248,7 @@ impl Tenant {
|
||||
.context("wait for timeline uploads to complete")?;
|
||||
}
|
||||
|
||||
return Ok(None);
|
||||
return Err(CreateTimelineError::AlreadyExists);
|
||||
}
|
||||
|
||||
let loaded_timeline = match ancestor_timeline_id {
|
||||
@@ -1418,12 +1263,12 @@ impl Tenant {
|
||||
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
|
||||
if ancestor_ancestor_lsn > *lsn {
|
||||
// can we safely just branch from the ancestor instead?
|
||||
bail!(
|
||||
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
|
||||
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
|
||||
lsn,
|
||||
ancestor_timeline_id,
|
||||
ancestor_ancestor_lsn,
|
||||
);
|
||||
)));
|
||||
}
|
||||
|
||||
// Wait for the WAL to arrive and be processed on the parent branch up
|
||||
@@ -1457,7 +1302,7 @@ impl Tenant {
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(Some(loaded_timeline))
|
||||
Ok(loaded_timeline)
|
||||
}
|
||||
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
@@ -1523,7 +1368,7 @@ impl Tenant {
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
timeline
|
||||
.compact(ctx)
|
||||
.instrument(info_span!("compact_timeline", timeline = %timeline_id))
|
||||
.instrument(info_span!("compact_timeline", %timeline_id))
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -1614,12 +1459,12 @@ impl Tenant {
|
||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
|
||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||
// NB: remote_timeline_client upload tasks that reference these layers have been cancelled
|
||||
// by the caller.
|
||||
|
||||
let local_timeline_directory = self
|
||||
.conf
|
||||
.timeline_path(&timeline.timeline_id, &self.tenant_id);
|
||||
.timeline_path(&self.tenant_id, &timeline.timeline_id);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
||||
@@ -1672,20 +1517,7 @@ impl Tenant {
|
||||
remote_client.delete_all().await.context("delete_all")?
|
||||
};
|
||||
|
||||
// Have a failpoint that can use the `pause` failpoint action.
|
||||
// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint in_progress_delete");
|
||||
fail::fail_point!("in_progress_delete");
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
|
||||
{
|
||||
// Remove the timeline from the map.
|
||||
@@ -1719,7 +1551,7 @@ impl Tenant {
|
||||
timeline_id: TimelineId,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
timeline::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Transition the timeline into TimelineState::Stopping.
|
||||
// This should prevent new operations from starting.
|
||||
@@ -1755,14 +1587,11 @@ impl Tenant {
|
||||
timeline = Arc::clone(timeline_entry.get());
|
||||
|
||||
// Prevent two tasks from trying to delete the timeline at the same time.
|
||||
delete_lock_guard =
|
||||
DeletionGuard(Arc::clone(&timeline.delete_lock).try_lock_owned().map_err(
|
||||
|_| {
|
||||
DeleteTimelineError::Other(anyhow::anyhow!(
|
||||
"timeline deletion is already in progress"
|
||||
))
|
||||
},
|
||||
)?);
|
||||
delete_lock_guard = DeletionGuard(
|
||||
Arc::clone(&timeline.delete_lock)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
|
||||
);
|
||||
|
||||
// If another task finished the deletion just before we acquired the lock,
|
||||
// return success.
|
||||
@@ -1886,7 +1715,7 @@ impl Tenant {
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let mut activating = false;
|
||||
self.state.send_modify(|current_state| {
|
||||
@@ -1957,7 +1786,7 @@ impl Tenant {
|
||||
///
|
||||
/// This will attempt to shutdown even if tenant is broken.
|
||||
pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
// Set tenant (and its timlines) to Stoppping state.
|
||||
//
|
||||
// Since we can only transition into Stopping state after activation is complete,
|
||||
@@ -2403,7 +2232,7 @@ impl Tenant {
|
||||
/// Locate and load config
|
||||
pub(super) fn load_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: &TenantId,
|
||||
) -> anyhow::Result<TenantConfOpt> {
|
||||
let target_config_path = conf.tenant_config_path(tenant_id);
|
||||
let target_config_display = target_config_path.display();
|
||||
@@ -2704,7 +2533,7 @@ impl Tenant {
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let tl = self
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
|
||||
.await?;
|
||||
@@ -2721,7 +2550,7 @@ impl Tenant {
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
|
||||
.await
|
||||
}
|
||||
@@ -2732,7 +2561,7 @@ impl Tenant {
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let src_id = src_timeline.timeline_id;
|
||||
|
||||
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
|
||||
@@ -2772,16 +2601,17 @@ impl Tenant {
|
||||
.context(format!(
|
||||
"invalid branch start lsn: less than latest GC cutoff {}",
|
||||
*latest_gc_cutoff_lsn,
|
||||
))?;
|
||||
))
|
||||
.map_err(CreateTimelineError::AncestorLsn)?;
|
||||
|
||||
// and then the planned GC cutoff
|
||||
{
|
||||
let gc_info = src_timeline.gc_info.read().unwrap();
|
||||
let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
|
||||
if start_lsn < cutoff {
|
||||
bail!(format!(
|
||||
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
|
||||
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
|
||||
));
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2989,7 +2819,7 @@ impl Tenant {
|
||||
timeline_struct.init_empty_layer_map(start_lsn);
|
||||
|
||||
if let Err(e) =
|
||||
self.create_timeline_files(&uninit_mark.timeline_path, new_timeline_id, new_metadata)
|
||||
self.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
|
||||
{
|
||||
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
@@ -2998,17 +2828,17 @@ impl Tenant {
|
||||
|
||||
debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}");
|
||||
|
||||
Ok(UninitializedTimeline {
|
||||
owning_tenant: self,
|
||||
timeline_id: new_timeline_id,
|
||||
raw_timeline: Some((timeline_struct, uninit_mark)),
|
||||
})
|
||||
Ok(UninitializedTimeline::new(
|
||||
self,
|
||||
new_timeline_id,
|
||||
Some((timeline_struct, uninit_mark)),
|
||||
))
|
||||
}
|
||||
|
||||
fn create_timeline_files(
|
||||
&self,
|
||||
timeline_path: &Path,
|
||||
new_timeline_id: TimelineId,
|
||||
new_timeline_id: &TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
||||
@@ -3019,8 +2849,8 @@ impl Tenant {
|
||||
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&self.tenant_id,
|
||||
new_timeline_id,
|
||||
self.tenant_id,
|
||||
new_metadata,
|
||||
true,
|
||||
)
|
||||
@@ -3043,7 +2873,7 @@ impl Tenant {
|
||||
timelines.get(&timeline_id).is_none(),
|
||||
"Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory"
|
||||
);
|
||||
let timeline_path = self.conf.timeline_path(&timeline_id, &tenant_id);
|
||||
let timeline_path = self.conf.timeline_path(&tenant_id, &timeline_id);
|
||||
anyhow::ensure!(
|
||||
!timeline_path.exists(),
|
||||
"Timeline {} already exists, cannot create its uninit mark file",
|
||||
@@ -3174,10 +3004,10 @@ pub(crate) enum CreateTenantFilesMode {
|
||||
pub(crate) fn create_tenant_files(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: &TenantId,
|
||||
mode: CreateTenantFilesMode,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let target_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
let target_tenant_directory = conf.tenant_path(tenant_id);
|
||||
anyhow::ensure!(
|
||||
!target_tenant_directory
|
||||
.try_exists()
|
||||
@@ -3228,7 +3058,7 @@ pub(crate) fn create_tenant_files(
|
||||
fn try_create_target_tenant_dir(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: &TenantId,
|
||||
mode: CreateTenantFilesMode,
|
||||
temporary_tenant_dir: &Path,
|
||||
target_tenant_directory: &Path,
|
||||
@@ -3252,7 +3082,7 @@ fn try_create_target_tenant_dir(
|
||||
}
|
||||
|
||||
let temporary_tenant_timelines_dir = rebase_directory(
|
||||
&conf.timelines_path(&tenant_id),
|
||||
&conf.timelines_path(tenant_id),
|
||||
target_tenant_directory,
|
||||
temporary_tenant_dir,
|
||||
)
|
||||
@@ -3264,7 +3094,7 @@ fn try_create_target_tenant_dir(
|
||||
)
|
||||
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
|
||||
|
||||
Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
|
||||
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
|
||||
|
||||
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
|
||||
format!(
|
||||
@@ -3552,7 +3382,7 @@ pub mod harness {
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
self.conf.timeline_path(&self.tenant_id, timeline_id)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3814,6 +3644,9 @@ mod tests {
|
||||
{
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
let CreateTimelineError::AncestorLsn(err) = err else {
|
||||
panic!("wrong error type")
|
||||
};
|
||||
assert!(err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(err
|
||||
.source()
|
||||
@@ -3843,6 +3676,9 @@ mod tests {
|
||||
{
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
let CreateTimelineError::AncestorLsn(err) = err else {
|
||||
panic!("wrong error type");
|
||||
};
|
||||
assert!(&err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(&err
|
||||
.source()
|
||||
@@ -4499,13 +4335,13 @@ mod tests {
|
||||
// assert freeze_and_flush exercised the initdb optimization
|
||||
{
|
||||
let state = tline.flush_loop_state.lock().unwrap();
|
||||
let
|
||||
timeline::FlushLoopState::Running {
|
||||
expect_initdb_optimization,
|
||||
initdb_optimization_count,
|
||||
} = *state else {
|
||||
panic!("unexpected state: {:?}", *state);
|
||||
};
|
||||
let timeline::FlushLoopState::Running {
|
||||
expect_initdb_optimization,
|
||||
initdb_optimization_count,
|
||||
} = *state
|
||||
else {
|
||||
panic!("unexpected state: {:?}", *state);
|
||||
};
|
||||
assert!(expect_initdb_optimization);
|
||||
assert!(initdb_optimization_count > 0);
|
||||
}
|
||||
@@ -4540,7 +4376,7 @@ mod tests {
|
||||
|
||||
assert!(!harness
|
||||
.conf
|
||||
.timeline_path(&TIMELINE_ID, &tenant.tenant_id)
|
||||
.timeline_path(&tenant.tenant_id, &TIMELINE_ID)
|
||||
.exists());
|
||||
|
||||
assert!(!harness
|
||||
@@ -4551,28 +4387,3 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(debug_assertions))]
|
||||
#[inline]
|
||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {}
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
|
||||
utils::tracing_span_assert::MultiNameExtractor<2>,
|
||||
> = once_cell::sync::Lazy::new(|| {
|
||||
utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
|
||||
});
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
#[inline]
|
||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {
|
||||
use utils::tracing_span_assert;
|
||||
|
||||
match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) {
|
||||
Ok(()) => (),
|
||||
Err(missing) => panic!(
|
||||
"missing extractors: {:?}",
|
||||
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -442,7 +442,7 @@ where
|
||||
writer: W,
|
||||
|
||||
///
|
||||
/// stack[0] is the current root page, stack.last() is the leaf.
|
||||
/// `stack[0]` is the current root page, `stack.last()` is the leaf.
|
||||
///
|
||||
/// We maintain the length of the stack to be always greater than zero.
|
||||
/// Two exceptions are:
|
||||
|
||||
@@ -55,7 +55,7 @@ impl EphemeralFile {
|
||||
l.next_file_id += 1;
|
||||
|
||||
let filename = conf
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.timeline_path(&tenant_id, &timeline_id)
|
||||
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
|
||||
|
||||
let file = VirtualFile::open_with_options(
|
||||
@@ -346,7 +346,7 @@ mod tests {
|
||||
|
||||
let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?;
|
||||
fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
|
||||
|
||||
Ok((conf, tenant_id, timeline_id))
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
//! Other read methods are less critical but still impact performance of background tasks.
|
||||
//!
|
||||
//! This data structure relies on a persistent/immutable binary search tree. See the
|
||||
//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
||||
//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
|
||||
//! Summary: A persistent/immutable BST (and persistent data structures in general) allows
|
||||
//! you to modify the tree in such a way that each modification creates a new "version"
|
||||
//! of the tree. When you modify it, you get a new version, but all previous versions are
|
||||
@@ -40,7 +40,7 @@
|
||||
//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
|
||||
//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
|
||||
//! to throw away most of the persistent BST and build a new one, starting from the oldest
|
||||
//! LSN. See `LayerMap::flush_updates()`.
|
||||
//! LSN. See [`LayerMap::flush_updates()`].
|
||||
//!
|
||||
|
||||
mod historic_layer_coverage;
|
||||
@@ -51,25 +51,22 @@ use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
pub use historic_layer_coverage::Replacement;
|
||||
pub use historic_layer_coverage::LayerKey;
|
||||
|
||||
use super::storage_layer::range_eq;
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
use super::storage_layer::PersistentLayerKey;
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
///
|
||||
pub struct LayerMap<L: ?Sized> {
|
||||
#[derive(Default)]
|
||||
pub struct LayerMap {
|
||||
//
|
||||
// 'open_layer' holds the current InMemoryLayer that is accepting new
|
||||
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
|
||||
@@ -95,24 +92,6 @@ pub struct LayerMap<L: ?Sized> {
|
||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
|
||||
|
||||
/// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
|
||||
/// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
|
||||
/// RemoteLayer will be removed.
|
||||
mapping: HashMap<PersistentLayerKey, Arc<L>>,
|
||||
}
|
||||
|
||||
impl<L: ?Sized> Default for LayerMap<L> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
open_layer: None,
|
||||
next_open_layer_at: None,
|
||||
frozen_layers: VecDeque::default(),
|
||||
l0_delta_layers: Vec::default(),
|
||||
historic: BufferedHistoricLayerCoverage::default(),
|
||||
mapping: HashMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The primary update API for the layer map.
|
||||
@@ -120,24 +99,21 @@ impl<L: ?Sized> Default for LayerMap<L> {
|
||||
/// Batching historic layer insertions and removals is good for
|
||||
/// performance and this struct helps us do that correctly.
|
||||
#[must_use]
|
||||
pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
|
||||
pub struct BatchedUpdates<'a> {
|
||||
// While we hold this exclusive reference to the layer map the type checker
|
||||
// will prevent us from accidentally reading any unflushed updates.
|
||||
layer_map: &'a mut LayerMap<L>,
|
||||
layer_map: &'a mut LayerMap,
|
||||
}
|
||||
|
||||
/// Provide ability to batch more updates while hiding the read
|
||||
/// API so we don't accidentally read without flushing.
|
||||
impl<L> BatchedUpdates<'_, L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
impl BatchedUpdates<'_> {
|
||||
///
|
||||
/// Insert an on-disk layer.
|
||||
///
|
||||
// TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
|
||||
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
self.layer_map.insert_historic_noflush(layer_desc, layer)
|
||||
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
self.layer_map.insert_historic_noflush(layer_desc)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -145,31 +121,8 @@ where
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
self.layer_map.remove_historic_noflush(layer_desc, layer)
|
||||
}
|
||||
|
||||
/// Replaces existing layer iff it is the `expected`.
|
||||
///
|
||||
/// If the expected layer has been removed it will not be inserted by this function.
|
||||
///
|
||||
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
|
||||
/// be done.
|
||||
///
|
||||
/// TODO replacement can be done without buffering and rebuilding layer map updates.
|
||||
/// One way to do that is to add a layer of indirection for returned values, so
|
||||
/// that we can replace values only by updating a hashmap.
|
||||
pub fn replace_historic(
|
||||
&mut self,
|
||||
expected_desc: PersistentLayerDesc,
|
||||
expected: &Arc<L>,
|
||||
new_desc: PersistentLayerDesc,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
|
||||
|
||||
self.layer_map
|
||||
.replace_historic_noflush(expected_desc, expected, new_desc, new)
|
||||
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
self.layer_map.remove_historic_noflush(layer_desc)
|
||||
}
|
||||
|
||||
// We will flush on drop anyway, but this method makes it
|
||||
@@ -185,25 +138,19 @@ where
|
||||
// than panic later or read without flushing.
|
||||
//
|
||||
// TODO maybe warn if flush hasn't explicitly been called
|
||||
impl<L> Drop for BatchedUpdates<'_, L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
impl Drop for BatchedUpdates<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.layer_map.flush_updates();
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
pub struct SearchResult<L: ?Sized> {
|
||||
pub layer: Arc<L>,
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<PersistentLayerDesc>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
impl<L> LayerMap<L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
/// 'key', with lsn.start < 'end_lsn'.
|
||||
@@ -235,7 +182,7 @@ where
|
||||
/// NOTE: This only searches the 'historic' layers, *not* the
|
||||
/// 'open' and 'frozen' layers!
|
||||
///
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
|
||||
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
|
||||
let latest_delta = version.delta_coverage.query(key.to_i128());
|
||||
let latest_image = version.image_coverage.query(key.to_i128());
|
||||
@@ -244,7 +191,6 @@ where
|
||||
(None, None) => None,
|
||||
(None, Some(image)) => {
|
||||
let lsn_floor = image.get_lsn_range().start;
|
||||
let image = self.get_layer_from_mapping(&image.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: image,
|
||||
lsn_floor,
|
||||
@@ -252,7 +198,6 @@ where
|
||||
}
|
||||
(Some(delta), None) => {
|
||||
let lsn_floor = delta.get_lsn_range().start;
|
||||
let delta = self.get_layer_from_mapping(&delta.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
@@ -263,7 +208,6 @@ where
|
||||
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
|
||||
let image_exact_match = img_lsn + 1 == end_lsn;
|
||||
if image_is_newer || image_exact_match {
|
||||
let image = self.get_layer_from_mapping(&image.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: image,
|
||||
lsn_floor: img_lsn,
|
||||
@@ -271,7 +215,6 @@ where
|
||||
} else {
|
||||
let lsn_floor =
|
||||
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
|
||||
let delta = self.get_layer_from_mapping(&delta.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
@@ -282,7 +225,7 @@ where
|
||||
}
|
||||
|
||||
/// Start a batch of updates, applied on drop
|
||||
pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
|
||||
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
|
||||
BatchedUpdates { layer_map: self }
|
||||
}
|
||||
|
||||
@@ -292,48 +235,32 @@ where
|
||||
/// Helper function for BatchedUpdates::insert_historic
|
||||
///
|
||||
/// TODO(chi): remove L generic so that we do not need to pass layer object.
|
||||
pub(self) fn insert_historic_noflush(
|
||||
&mut self,
|
||||
layer_desc: PersistentLayerDesc,
|
||||
layer: Arc<L>,
|
||||
) {
|
||||
self.mapping.insert(layer_desc.key(), layer.clone());
|
||||
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
|
||||
|
||||
if Self::is_l0(&layer) {
|
||||
if Self::is_l0(&layer_desc) {
|
||||
self.l0_delta_layers.push(layer_desc.clone().into());
|
||||
}
|
||||
|
||||
self.historic.insert(
|
||||
historic_layer_coverage::LayerKey::from(&*layer),
|
||||
historic_layer_coverage::LayerKey::from(&layer_desc),
|
||||
layer_desc.into(),
|
||||
);
|
||||
}
|
||||
|
||||
fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
|
||||
let layer = self
|
||||
.mapping
|
||||
.get(key)
|
||||
.with_context(|| format!("{key:?}"))
|
||||
.expect("inconsistent layer mapping");
|
||||
layer
|
||||
}
|
||||
|
||||
///
|
||||
/// Remove an on-disk layer from the map.
|
||||
///
|
||||
/// Helper function for BatchedUpdates::remove_historic
|
||||
///
|
||||
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(&*layer));
|
||||
if Self::is_l0(&layer) {
|
||||
.remove(historic_layer_coverage::LayerKey::from(&layer_desc));
|
||||
let layer_key = layer_desc.key();
|
||||
if Self::is_l0(&layer_desc) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| {
|
||||
!Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
|
||||
});
|
||||
l0_delta_layers.retain(|other| other.key() != layer_key);
|
||||
self.l0_delta_layers = l0_delta_layers;
|
||||
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
|
||||
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
|
||||
@@ -344,69 +271,6 @@ where
|
||||
"failed to locate removed historic layer from l0_delta_layers"
|
||||
);
|
||||
}
|
||||
self.mapping.remove(&layer_desc.key());
|
||||
}
|
||||
|
||||
pub(self) fn replace_historic_noflush(
|
||||
&mut self,
|
||||
expected_desc: PersistentLayerDesc,
|
||||
expected: &Arc<L>,
|
||||
new_desc: PersistentLayerDesc,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
let key = historic_layer_coverage::LayerKey::from(&**expected);
|
||||
let other = historic_layer_coverage::LayerKey::from(&*new);
|
||||
|
||||
let expected_l0 = Self::is_l0(expected);
|
||||
let new_l0 = Self::is_l0(&new);
|
||||
|
||||
anyhow::ensure!(
|
||||
key == other,
|
||||
"expected and new must have equal LayerKeys: {key:?} != {other:?}"
|
||||
);
|
||||
|
||||
anyhow::ensure!(
|
||||
expected_l0 == new_l0,
|
||||
"expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
|
||||
);
|
||||
|
||||
let l0_index = if expected_l0 {
|
||||
// find the index in case replace worked, we need to replace that as well
|
||||
let pos = self.l0_delta_layers.iter().position(|slot| {
|
||||
Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
|
||||
});
|
||||
|
||||
if pos.is_none() {
|
||||
return Ok(Replacement::NotFound);
|
||||
}
|
||||
pos
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let new_desc = Arc::new(new_desc);
|
||||
let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
|
||||
**existing == expected_desc
|
||||
});
|
||||
|
||||
if let Replacement::Replaced { .. } = &replaced {
|
||||
self.mapping.remove(&expected_desc.key());
|
||||
self.mapping.insert(new_desc.key(), new);
|
||||
if let Some(index) = l0_index {
|
||||
self.l0_delta_layers[index] = new_desc;
|
||||
}
|
||||
}
|
||||
|
||||
let replaced = match replaced {
|
||||
Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
|
||||
Replacement::NotFound => Replacement::NotFound,
|
||||
Replacement::RemovalBuffered => Replacement::RemovalBuffered,
|
||||
Replacement::Unexpected(x) => {
|
||||
Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
|
||||
}
|
||||
};
|
||||
|
||||
Ok(replaced)
|
||||
}
|
||||
|
||||
/// Helper function for BatchedUpdates::drop.
|
||||
@@ -454,10 +318,8 @@ where
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
|
||||
self.historic
|
||||
.iter()
|
||||
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
///
|
||||
@@ -472,7 +334,7 @@ where
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.0) {
|
||||
Some(v) => v,
|
||||
None => return Ok(vec![]),
|
||||
@@ -482,37 +344,27 @@ where
|
||||
let end = key_range.end.to_i128();
|
||||
|
||||
// Initialize loop variables
|
||||
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
|
||||
let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
|
||||
let mut current_key = start;
|
||||
let mut current_val = version.image_coverage.query(start);
|
||||
|
||||
// Loop through the change events and push intervals
|
||||
for (change_key, change_val) in version.image_coverage.range(start..end) {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
coverage.push((
|
||||
kr,
|
||||
current_val
|
||||
.take()
|
||||
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
|
||||
));
|
||||
coverage.push((kr, current_val.take()));
|
||||
current_key = change_key;
|
||||
current_val = change_val.clone();
|
||||
}
|
||||
|
||||
// Add the final interval
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||
coverage.push((
|
||||
kr,
|
||||
current_val
|
||||
.take()
|
||||
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
|
||||
));
|
||||
coverage.push((kr, current_val.take()));
|
||||
|
||||
Ok(coverage)
|
||||
}
|
||||
|
||||
pub fn is_l0(layer: &L) -> bool {
|
||||
range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
|
||||
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
|
||||
layer.get_key_range() == (Key::MIN..Key::MAX)
|
||||
}
|
||||
|
||||
/// This function determines which layers are counted in `count_deltas`:
|
||||
@@ -537,14 +389,14 @@ where
|
||||
/// TODO The optimal number should probably be slightly higher than 1, but to
|
||||
/// implement that we need to plumb a lot more context into this function
|
||||
/// than just the current partition_range.
|
||||
pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
|
||||
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
|
||||
// Case 1
|
||||
if !Self::is_l0(layer) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Case 2
|
||||
if range_eq(partition_range, &(Key::MIN..Key::MAX)) {
|
||||
if partition_range == &(Key::MIN..Key::MAX) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -595,9 +447,7 @@ where
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
if !kr.is_empty() {
|
||||
let base_count =
|
||||
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
|
||||
as usize;
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath =
|
||||
self.count_deltas(&kr, &lr, new_limit)?;
|
||||
@@ -620,9 +470,7 @@ where
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
|
||||
if !kr.is_empty() {
|
||||
let base_count =
|
||||
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
|
||||
as usize;
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
@@ -772,12 +620,8 @@ where
|
||||
}
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
|
||||
Ok(self
|
||||
.l0_delta_layers
|
||||
.iter()
|
||||
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
|
||||
.collect())
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
|
||||
Ok(self.l0_delta_layers.to_vec())
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
@@ -802,72 +646,67 @@ where
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
|
||||
///
|
||||
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
|
||||
#[inline(always)]
|
||||
pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
|
||||
// "dyn Trait" objects are "fat pointers" in that they have two components:
|
||||
// - pointer to the object
|
||||
// - pointer to the vtable
|
||||
//
|
||||
// rust does not provide a guarantee that these vtables are unique, but however
|
||||
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
|
||||
// pointer and the vtable need to be equal.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/issues/103763
|
||||
//
|
||||
// A future version of rust will most likely use this form below, where we cast each
|
||||
// pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
|
||||
// not affect the comparison.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/pull/106450
|
||||
let left = Arc::as_ptr(left) as *const ();
|
||||
let right = Arc::as_ptr(right) as *const ();
|
||||
|
||||
left == right
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{LayerMap, Replacement};
|
||||
use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
|
||||
use super::LayerMap;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
mod l0_delta_layers_updated {
|
||||
|
||||
use crate::tenant::{
|
||||
storage_layer::{AsLayerDesc, PersistentLayerDesc},
|
||||
timeline::layer_manager::LayerFileManager,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
struct LayerObject(PersistentLayerDesc);
|
||||
|
||||
impl AsLayerDesc for LayerObject {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerObject {
|
||||
fn new(desc: PersistentLayerDesc) -> Self {
|
||||
LayerObject(desc)
|
||||
}
|
||||
}
|
||||
|
||||
type TestLayerFileManager = LayerFileManager<LayerObject>;
|
||||
|
||||
#[test]
|
||||
fn for_full_range_delta() {
|
||||
// l0_delta_layers are used by compaction, and should observe all buffered updates
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
|
||||
true
|
||||
)
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
|
||||
true
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn for_non_full_range_delta() {
|
||||
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
|
||||
// because not full range
|
||||
false
|
||||
)
|
||||
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
|
||||
// because not full range
|
||||
false
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn for_image() {
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
|
||||
// code only checks if it is a full range layer, doesn't care about images, which must
|
||||
// mean we should in practice never have full range images
|
||||
false
|
||||
)
|
||||
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
|
||||
// code only checks if it is a full range layer, doesn't care about images, which must
|
||||
// mean we should in practice never have full range images
|
||||
false
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -877,75 +716,70 @@ mod tests {
|
||||
|
||||
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
|
||||
let layer = LayerFileName::from_str(layer).unwrap();
|
||||
let layer = LayerDescriptor::from(layer);
|
||||
let layer = PersistentLayerDesc::from(layer);
|
||||
|
||||
// same skeletan construction; see scenario below
|
||||
let not_found = Arc::new(layer.clone());
|
||||
let new_version = Arc::new(layer);
|
||||
let not_found = Arc::new(LayerObject::new(layer.clone()));
|
||||
let new_version = Arc::new(LayerObject::new(layer));
|
||||
|
||||
let mut map = LayerMap::default();
|
||||
// after the immutable storage state refactor, the replace operation
|
||||
// will not use layer map any more. We keep it here for consistency in test cases
|
||||
// and can remove it in the future.
|
||||
let _map = LayerMap::default();
|
||||
|
||||
let res = map.batch_update().replace_historic(
|
||||
not_found.get_persistent_layer_desc(),
|
||||
¬_found,
|
||||
new_version.get_persistent_layer_desc(),
|
||||
new_version,
|
||||
);
|
||||
let mut mapping = TestLayerFileManager::new();
|
||||
|
||||
assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
|
||||
mapping
|
||||
.replace_and_verify(not_found, new_version)
|
||||
.unwrap_err();
|
||||
}
|
||||
|
||||
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
|
||||
let name = LayerFileName::from_str(layer_name).unwrap();
|
||||
let skeleton = LayerDescriptor::from(name);
|
||||
let skeleton = PersistentLayerDesc::from(name);
|
||||
|
||||
let remote = Arc::new(skeleton.clone());
|
||||
let downloaded = Arc::new(skeleton);
|
||||
let remote = Arc::new(LayerObject::new(skeleton.clone()));
|
||||
let downloaded = Arc::new(LayerObject::new(skeleton));
|
||||
|
||||
let mut map = LayerMap::default();
|
||||
let mut mapping = LayerFileManager::new();
|
||||
|
||||
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
|
||||
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
|
||||
assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));
|
||||
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
|
||||
|
||||
let expected_in_counts = (1, usize::from(expected_l0));
|
||||
|
||||
map.batch_update()
|
||||
.insert_historic(remote.get_persistent_layer_desc(), remote.clone());
|
||||
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
|
||||
|
||||
let replaced = map
|
||||
.batch_update()
|
||||
.replace_historic(
|
||||
remote.get_persistent_layer_desc(),
|
||||
&remote,
|
||||
downloaded.get_persistent_layer_desc(),
|
||||
downloaded.clone(),
|
||||
)
|
||||
.expect("name derived attributes are the same");
|
||||
assert!(
|
||||
matches!(replaced, Replacement::Replaced { .. }),
|
||||
"{replaced:?}"
|
||||
.insert_historic(remote.layer_desc().clone());
|
||||
mapping.insert(remote.clone());
|
||||
assert_eq!(
|
||||
count_layer_in(&map, remote.layer_desc()),
|
||||
expected_in_counts
|
||||
);
|
||||
|
||||
mapping
|
||||
.replace_and_verify(remote, downloaded.clone())
|
||||
.expect("name derived attributes are the same");
|
||||
assert_eq!(
|
||||
count_layer_in(&map, downloaded.layer_desc()),
|
||||
expected_in_counts
|
||||
);
|
||||
assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
|
||||
|
||||
map.batch_update()
|
||||
.remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
|
||||
assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
|
||||
.remove_historic(downloaded.layer_desc().clone());
|
||||
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
|
||||
}
|
||||
|
||||
fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
|
||||
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
|
||||
let historic = map
|
||||
.iter_historic_layers()
|
||||
.filter(|x| LayerMap::compare_arced_layers(x, layer))
|
||||
.filter(|x| x.key() == layer.key())
|
||||
.count();
|
||||
let l0s = map
|
||||
.get_level0_deltas()
|
||||
.expect("why does this return a result");
|
||||
let l0 = l0s
|
||||
.iter()
|
||||
.filter(|x| LayerMap::compare_arced_layers(x, layer))
|
||||
.count();
|
||||
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
|
||||
|
||||
(historic, l0)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@ use std::ops::Range;
|
||||
|
||||
use tracing::info;
|
||||
|
||||
use crate::tenant::storage_layer::PersistentLayerDesc;
|
||||
|
||||
use super::layer_coverage::LayerCoverageTuple;
|
||||
|
||||
/// Layers in this module are identified and indexed by this data.
|
||||
@@ -41,8 +43,8 @@ impl Ord for LayerKey {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey {
|
||||
fn from(layer: &'a L) -> Self {
|
||||
impl From<&PersistentLayerDesc> for LayerKey {
|
||||
fn from(layer: &PersistentLayerDesc) -> Self {
|
||||
let kr = layer.get_key_range();
|
||||
let lr = layer.get_lsn_range();
|
||||
LayerKey {
|
||||
@@ -120,8 +122,7 @@ impl<Value: Clone> HistoricLayerCoverage<Value> {
|
||||
self.head = self
|
||||
.historic
|
||||
.iter()
|
||||
.rev()
|
||||
.next()
|
||||
.next_back()
|
||||
.map(|(_, v)| v.clone())
|
||||
.unwrap_or_default();
|
||||
}
|
||||
@@ -410,7 +411,7 @@ fn test_persistent_overlapping() {
|
||||
/// still be more critical.
|
||||
///
|
||||
/// See this for more on persistent and retroactive techniques:
|
||||
/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
||||
/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
|
||||
pub struct BufferedHistoricLayerCoverage<Value> {
|
||||
/// A persistent layer map that we rebuild when we need to retroactively update
|
||||
historic_coverage: HistoricLayerCoverage<Value>,
|
||||
@@ -454,59 +455,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
self.buffer.insert(layer_key, None);
|
||||
}
|
||||
|
||||
/// Replaces a previous layer with a new layer value.
|
||||
///
|
||||
/// The replacement is conditional on:
|
||||
/// - there is an existing `LayerKey` record
|
||||
/// - there is no buffered removal for the given `LayerKey`
|
||||
/// - the given closure returns true for the current `Value`
|
||||
///
|
||||
/// The closure is used to compare the latest value (buffered insert, or existing layer)
|
||||
/// against some expectation. This allows to use `Arc::ptr_eq` or similar which would be
|
||||
/// inaccessible via `PartialEq` trait.
|
||||
///
|
||||
/// Returns a `Replacement` value describing the outcome; only the case of
|
||||
/// `Replacement::Replaced` modifies the map and requires a rebuild.
|
||||
pub fn replace<F>(
|
||||
&mut self,
|
||||
layer_key: &LayerKey,
|
||||
new: Value,
|
||||
check_expected: F,
|
||||
) -> Replacement<Value>
|
||||
where
|
||||
F: FnOnce(&Value) -> bool,
|
||||
{
|
||||
let (slot, in_buffered) = match self.buffer.get(layer_key) {
|
||||
Some(inner @ Some(_)) => {
|
||||
// we compare against the buffered version, because there will be a later
|
||||
// rebuild before querying
|
||||
(inner.as_ref(), true)
|
||||
}
|
||||
Some(None) => {
|
||||
// buffer has removal for this key; it will not be equivalent by any check_expected.
|
||||
return Replacement::RemovalBuffered;
|
||||
}
|
||||
None => {
|
||||
// no pending modification for the key, check layers
|
||||
(self.layers.get(layer_key), false)
|
||||
}
|
||||
};
|
||||
|
||||
match slot {
|
||||
Some(existing) if !check_expected(existing) => {
|
||||
// unfortunate clone here, but otherwise the nll borrowck grows the region of
|
||||
// 'a to cover the whole function, and we could not mutate in the other
|
||||
// Some(existing) branch
|
||||
Replacement::Unexpected(existing.clone())
|
||||
}
|
||||
None => Replacement::NotFound,
|
||||
Some(_existing) => {
|
||||
self.insert(layer_key.to_owned(), new);
|
||||
Replacement::Replaced { in_buffered }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn rebuild(&mut self) {
|
||||
// Find the first LSN that needs to be rebuilt
|
||||
let rebuild_since: u64 = match self.buffer.iter().next() {
|
||||
@@ -575,22 +523,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Outcome of the replace operation.
|
||||
#[derive(Debug)]
|
||||
pub enum Replacement<Value> {
|
||||
/// Previous value was replaced with the new value.
|
||||
Replaced {
|
||||
/// Replacement happened for a scheduled insert.
|
||||
in_buffered: bool,
|
||||
},
|
||||
/// Key was not found buffered updates or existing layers.
|
||||
NotFound,
|
||||
/// Key has been scheduled for removal, it was not replaced.
|
||||
RemovalBuffered,
|
||||
/// Previous value was rejected by the closure.
|
||||
Unexpected(Value),
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_retroactive_regression_1() {
|
||||
let mut map = BufferedHistoricLayerCoverage::new();
|
||||
@@ -699,139 +631,3 @@ fn test_retroactive_simple() {
|
||||
assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_retroactive_replacement() {
|
||||
let mut map = BufferedHistoricLayerCoverage::new();
|
||||
|
||||
let keys = [
|
||||
LayerKey {
|
||||
key: 0..5,
|
||||
lsn: 100..101,
|
||||
is_image: true,
|
||||
},
|
||||
LayerKey {
|
||||
key: 3..9,
|
||||
lsn: 110..111,
|
||||
is_image: true,
|
||||
},
|
||||
LayerKey {
|
||||
key: 4..6,
|
||||
lsn: 120..121,
|
||||
is_image: true,
|
||||
},
|
||||
];
|
||||
|
||||
let layers = [
|
||||
"Image 1".to_string(),
|
||||
"Image 2".to_string(),
|
||||
"Image 3".to_string(),
|
||||
];
|
||||
|
||||
for (key, layer) in keys.iter().zip(layers.iter()) {
|
||||
map.insert(key.to_owned(), layer.to_owned());
|
||||
}
|
||||
|
||||
// rebuild is not necessary here, because replace works for both buffered updates and existing
|
||||
// layers.
|
||||
|
||||
for (key, orig_layer) in keys.iter().zip(layers.iter()) {
|
||||
let replacement = format!("Remote {orig_layer}");
|
||||
|
||||
// evict
|
||||
let ret = map.replace(key, replacement.clone(), |l| l == orig_layer);
|
||||
assert!(
|
||||
matches!(ret, Replacement::Replaced { .. }),
|
||||
"replace {orig_layer}: {ret:?}"
|
||||
);
|
||||
map.rebuild();
|
||||
|
||||
let at = key.lsn.end + 1;
|
||||
|
||||
let version = map.get().expect("rebuilt").get_version(at).unwrap();
|
||||
assert_eq!(
|
||||
version.image_coverage.query(4).as_deref(),
|
||||
Some(replacement.as_str()),
|
||||
"query for 4 at version {at} after eviction",
|
||||
);
|
||||
|
||||
// download
|
||||
let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement);
|
||||
assert!(
|
||||
matches!(ret, Replacement::Replaced { .. }),
|
||||
"replace {orig_layer} back: {ret:?}"
|
||||
);
|
||||
map.rebuild();
|
||||
let version = map.get().expect("rebuilt").get_version(at).unwrap();
|
||||
assert_eq!(
|
||||
version.image_coverage.query(4).as_deref(),
|
||||
Some(orig_layer.as_str()),
|
||||
"query for 4 at version {at} after download",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn missing_key_is_not_inserted_with_replace() {
|
||||
let mut map = BufferedHistoricLayerCoverage::new();
|
||||
let key = LayerKey {
|
||||
key: 0..5,
|
||||
lsn: 100..101,
|
||||
is_image: true,
|
||||
};
|
||||
|
||||
let ret = map.replace(&key, "should not replace", |_| true);
|
||||
assert!(matches!(ret, Replacement::NotFound), "{ret:?}");
|
||||
map.rebuild();
|
||||
assert!(map
|
||||
.get()
|
||||
.expect("no changes to rebuild")
|
||||
.get_version(102)
|
||||
.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replacing_buffered_insert_and_remove() {
|
||||
let mut map = BufferedHistoricLayerCoverage::new();
|
||||
let key = LayerKey {
|
||||
key: 0..5,
|
||||
lsn: 100..101,
|
||||
is_image: true,
|
||||
};
|
||||
|
||||
map.insert(key.clone(), "Image 1");
|
||||
let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1");
|
||||
assert!(
|
||||
matches!(ret, Replacement::Replaced { in_buffered: true }),
|
||||
"{ret:?}"
|
||||
);
|
||||
map.rebuild();
|
||||
|
||||
assert_eq!(
|
||||
map.get()
|
||||
.expect("rebuilt")
|
||||
.get_version(102)
|
||||
.unwrap()
|
||||
.image_coverage
|
||||
.query(4),
|
||||
Some("Remote Image 1")
|
||||
);
|
||||
|
||||
map.remove(key.clone());
|
||||
let ret = map.replace(&key, "should not replace", |_| true);
|
||||
assert!(
|
||||
matches!(ret, Replacement::RemovalBuffered),
|
||||
"cannot replace after scheduled remove: {ret:?}"
|
||||
);
|
||||
|
||||
map.rebuild();
|
||||
|
||||
let ret = map.replace(&key, "should not replace", |_| true);
|
||||
assert!(
|
||||
matches!(ret, Replacement::NotFound),
|
||||
"cannot replace after remove + rebuild: {ret:?}"
|
||||
);
|
||||
|
||||
let at_version = map.get().expect("rebuilt").get_version(102);
|
||||
assert!(at_version.is_none());
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::ops::Range;
|
||||
|
||||
// NOTE the `im` crate has 20x more downloads and also has
|
||||
// persistent/immutable BTree. But it's bugged so rpds is a
|
||||
// better choice https://github.com/neondatabase/neon/issues/3395
|
||||
// better choice <https://github.com/neondatabase/neon/issues/3395>
|
||||
use rpds::RedBlackTreeMapSync;
|
||||
|
||||
/// Data structure that can efficiently:
|
||||
@@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync;
|
||||
/// - insert layers in non-decreasing lsn.start order
|
||||
///
|
||||
/// For a detailed explanation and justification of this approach, see:
|
||||
/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
|
||||
/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
|
||||
///
|
||||
/// NOTE The struct is parameterized over Value for easier
|
||||
/// testing, but in practice it's some sort of layer.
|
||||
@@ -113,8 +113,7 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
pub fn query(&self, key: i128) -> Option<Value> {
|
||||
self.nodes
|
||||
.range(..=key)
|
||||
.rev()
|
||||
.next()?
|
||||
.next_back()?
|
||||
.1
|
||||
.as_ref()
|
||||
.map(|(_, v)| v.clone())
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
//! Currently, this is not used in the system. Future refactors will ensure
|
||||
//! the storage state will be recorded in this file, and the system can be
|
||||
//! recovered from this file. This is tracked in
|
||||
//! https://github.com/neondatabase/neon/issues/4418
|
||||
//! <https://github.com/neondatabase/neon/issues/4418>
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
|
||||
//! Later, the file gets used in [`remote_timeline_client`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
@@ -232,13 +234,13 @@ impl TimelineMetadata {
|
||||
/// Save timeline metadata to file
|
||||
pub fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
data: &TimelineMetadata,
|
||||
first_save: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("saving metadata").entered();
|
||||
let path = conf.metadata_path(timeline_id, tenant_id);
|
||||
let path = conf.metadata_path(tenant_id, timeline_id);
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
@@ -267,10 +269,10 @@ pub fn save_metadata(
|
||||
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let metadata_path = conf.metadata_path(timeline_id, tenant_id);
|
||||
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
|
||||
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata bytes from path {}",
|
||||
|
||||
@@ -184,9 +184,9 @@ pub fn schedule_local_tenant_processing(
|
||||
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
anyhow::ensure!(
|
||||
!conf.tenant_ignore_mark_file_path(tenant_id).exists(),
|
||||
!conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
|
||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||
);
|
||||
|
||||
@@ -310,7 +310,7 @@ pub async fn create_tenant(
|
||||
// We're holding the tenants lock in write mode while doing local IO.
|
||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
|
||||
// and do the work in that state.
|
||||
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?;
|
||||
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -344,14 +344,9 @@ pub async fn set_new_tenant_config(
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
|
||||
let tenant_config_path = conf.tenant_config_path(tenant_id);
|
||||
Tenant::persist_tenant_config(
|
||||
&tenant.tenant_id(),
|
||||
&tenant_config_path,
|
||||
new_tenant_conf,
|
||||
false,
|
||||
)
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
let tenant_config_path = conf.tenant_config_path(&tenant_id);
|
||||
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
Ok(())
|
||||
}
|
||||
@@ -435,7 +430,7 @@ pub async fn detach_tenant(
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
info!("Detaching an ignored tenant");
|
||||
local_files_cleanup_operation(tenant_id)
|
||||
@@ -457,7 +452,7 @@ pub async fn load_tenant(
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
std::fs::remove_file(&tenant_ignore_mark)
|
||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||
@@ -478,7 +473,7 @@ pub async fn ignore_tenant(
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
fs::File::create(&ignore_mark_file)
|
||||
.await
|
||||
.context("Failed to create ignore mark file")
|
||||
@@ -525,7 +520,7 @@ pub async fn attach_tenant(
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || {
|
||||
let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?;
|
||||
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -695,7 +690,7 @@ pub async fn immediate_gc(
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
@@ -745,9 +740,7 @@ pub async fn immediate_compact(
|
||||
async move {
|
||||
let result = timeline
|
||||
.compact(&ctx)
|
||||
.instrument(
|
||||
info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
|
||||
)
|
||||
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
|
||||
.await;
|
||||
|
||||
match task_done.send(result) {
|
||||
|
||||
@@ -135,7 +135,7 @@
|
||||
//! - Initiate upload queue with that [`IndexPart`].
|
||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||
//! and remote state as per [`IndexPart`]. This is done in
|
||||
//! [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//!
|
||||
//! Note that if we crash during file deletion between the index update
|
||||
//! that removes the file from the list of files, and deleting the remote file,
|
||||
@@ -163,8 +163,8 @@
|
||||
//! - download their remote [`IndexPart`]s
|
||||
//! - create `Timeline` struct and a `RemoteTimelineClient`
|
||||
//! - initialize the client's upload queue with its `IndexPart`
|
||||
//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
|
||||
//! but not present locally
|
||||
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
|
||||
//! for layers that are referenced by `IndexPart` but not present locally
|
||||
//! - schedule uploads for layers that are only present locally.
|
||||
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
||||
//! the local filesystem, write the remote metadata to the local filesystem
|
||||
@@ -198,6 +198,8 @@
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
@@ -442,8 +444,8 @@ impl RemoteTimelineClient {
|
||||
let index_part = download::download_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
@@ -608,10 +610,7 @@ impl RemoteTimelineClient {
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
|
||||
info!(
|
||||
"scheduled layer file upload {}",
|
||||
layer_file_name.file_name()
|
||||
);
|
||||
info!("scheduled layer file upload {layer_file_name}");
|
||||
|
||||
// Launch the task immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
@@ -664,7 +663,7 @@ impl RemoteTimelineClient {
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
info!("scheduled layer file deletion {}", name.file_name());
|
||||
info!("scheduled layer file deletion {name}");
|
||||
}
|
||||
|
||||
// Launch the tasks immediately, if possible
|
||||
@@ -751,25 +750,13 @@ impl RemoteTimelineClient {
|
||||
stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
|
||||
});
|
||||
|
||||
// Have a failpoint that can use the `pause` failpoint action.
|
||||
// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint persist_deleted_index_part");
|
||||
fail::fail_point!("persist_deleted_index_part");
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
pausable_failpoint!("persist_deleted_index_part");
|
||||
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&index_part_with_deleted_at,
|
||||
)
|
||||
.await?;
|
||||
@@ -828,7 +815,7 @@ impl RemoteTimelineClient {
|
||||
.queued_operations
|
||||
.push_back(op);
|
||||
|
||||
info!("scheduled layer file deletion {}", name.file_name());
|
||||
info!("scheduled layer file deletion {name}");
|
||||
deletions_queued += 1;
|
||||
}
|
||||
|
||||
@@ -844,7 +831,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
|
||||
|
||||
let remaining = self
|
||||
@@ -855,17 +842,17 @@ impl RemoteTimelineClient {
|
||||
let remaining: Vec<RemotePath> = remaining
|
||||
.into_iter()
|
||||
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
|
||||
.inspect(|path| {
|
||||
if let Some(name) = path.object_name() {
|
||||
info!(%name, "deleting a file not referenced from index_part.json");
|
||||
} else {
|
||||
warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json");
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !remaining.is_empty() {
|
||||
warn!(
|
||||
"Found {} files not bound to index_file.json, proceeding with their deletion",
|
||||
remaining.len()
|
||||
);
|
||||
for file in remaining {
|
||||
warn!("Removing {}", file.object_name().unwrap_or_default());
|
||||
self.storage_impl.delete(&file).await?;
|
||||
}
|
||||
self.storage_impl.delete_objects(&remaining).await?;
|
||||
}
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
@@ -873,7 +860,7 @@ impl RemoteTimelineClient {
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
info!(deletions_queued, "done deleting, including index_part.json");
|
||||
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -938,11 +925,11 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Assign unique ID to this task
|
||||
upload_queue.task_counter += 1;
|
||||
let task_id = upload_queue.task_counter;
|
||||
let upload_task_id = upload_queue.task_counter;
|
||||
|
||||
// Add it to the in-progress map
|
||||
let task = Arc::new(UploadTask {
|
||||
task_id,
|
||||
task_id: upload_task_id,
|
||||
op: next_op,
|
||||
retries: AtomicU32::new(0),
|
||||
});
|
||||
@@ -952,6 +939,8 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Spawn task to perform the task
|
||||
let self_rc = Arc::clone(self);
|
||||
let tenant_id = self.tenant_id;
|
||||
let timeline_id = self.timeline_id;
|
||||
task_mgr::spawn(
|
||||
self.runtime.handle(),
|
||||
TaskKind::RemoteUploadTask,
|
||||
@@ -963,7 +952,7 @@ impl RemoteTimelineClient {
|
||||
self_rc.perform_upload_task(task).await;
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!(parent: None, "remote_upload", tenant = %self.tenant_id, timeline = %self.timeline_id, upload_task_id = %task_id)),
|
||||
.instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
|
||||
);
|
||||
|
||||
// Loop back to process next task
|
||||
@@ -1008,7 +997,7 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
||||
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||
.join(layer_file_name.file_name());
|
||||
upload::upload_timeline_layer(
|
||||
self.conf,
|
||||
@@ -1029,8 +1018,8 @@ impl RemoteTimelineClient {
|
||||
let res = upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
index_part,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -1049,7 +1038,7 @@ impl RemoteTimelineClient {
|
||||
UploadOp::Delete(delete) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
||||
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||
.join(delete.layer_file_name.file_name());
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path)
|
||||
.measure_remote_op(
|
||||
|
||||
@@ -19,9 +19,10 @@ pub(super) async fn delete_layer<'a>(
|
||||
|
||||
let path_to_delete = conf.remote_path(local_layer_path)?;
|
||||
|
||||
// XXX: If the deletion fails because the object already didn't exist,
|
||||
// it would be good to just issue a warning but consider it success.
|
||||
// https://github.com/neondatabase/neon/issues/2934
|
||||
// We don't want to print an error if the delete failed if the file has
|
||||
// already been deleted. Thankfully, in this situation S3 already
|
||||
// does not yield an error. While OS-provided local file system APIs do yield
|
||||
// errors, we avoid them in the `LocalFs` wrapper.
|
||||
storage.delete(&path_to_delete).await.with_context(|| {
|
||||
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
|
||||
})
|
||||
|
||||
@@ -16,7 +16,7 @@ use tracing::{info, warn};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
@@ -46,7 +46,7 @@ pub async fn download_layer_file<'a>(
|
||||
) -> Result<u64, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
let timeline_path = conf.timeline_path(&tenant_id, &timeline_id);
|
||||
|
||||
let local_path = timeline_path.join(layer_file_name.file_name());
|
||||
|
||||
@@ -229,11 +229,11 @@ pub async fn list_remote_timelines<'a>(
|
||||
pub(super) async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.metadata_path(tenant_id, timeline_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let part_storage_path = conf
|
||||
.remote_path(&index_part_path)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use fail::fail_point;
|
||||
use std::path::Path;
|
||||
use std::{io::ErrorKind, path::Path};
|
||||
use tokio::fs;
|
||||
|
||||
use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
|
||||
@@ -11,12 +11,14 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::LayerFileMetadata;
|
||||
|
||||
use tracing::info;
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
index_part: &'a IndexPart,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::trace!("uploading new index part");
|
||||
@@ -31,7 +33,7 @@ pub(super) async fn upload_index_part<'a>(
|
||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.metadata_path(tenant_id, timeline_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let storage_path = conf.remote_path(&index_part_path)?;
|
||||
|
||||
@@ -56,9 +58,21 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
});
|
||||
let storage_path = conf.remote_path(source_path)?;
|
||||
|
||||
let source_file = fs::File::open(&source_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?;
|
||||
let source_file_res = fs::File::open(&source_path).await;
|
||||
let source_file = match source_file_res {
|
||||
Ok(source_file) => source_file,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
// If we encounter this arm, it wasn't intended, but it's also not
|
||||
// a big problem, if it's because the file was deleted before an
|
||||
// upload. However, a nonexistent file can also be indicative of
|
||||
// something worse, like when a file is scheduled for upload before
|
||||
// it has been written to disk yet.
|
||||
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => Err(e)
|
||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?,
|
||||
};
|
||||
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
|
||||
@@ -110,11 +110,11 @@ pub struct TimelineInputs {
|
||||
///
|
||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||
/// is updated on-demand, during the start of this calculation and separate from the
|
||||
/// [`Timeline::latest_gc_cutoff`].
|
||||
/// [`TimelineInputs::latest_gc_cutoff`].
|
||||
///
|
||||
/// For timelines in general:
|
||||
///
|
||||
/// ```ignore
|
||||
/// ```text
|
||||
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
||||
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
||||
/// ```
|
||||
|
||||
17
pageserver/src/tenant/span.rs
Normal file
17
pageserver/src/tenant/span.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
#[cfg(debug_assertions)]
|
||||
use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
||||
|
||||
#[cfg(not(debug_assertions))]
|
||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {}
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
|
||||
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"]));
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
#[track_caller]
|
||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {
|
||||
if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
|
||||
panic!("missing extractors: {missing:?}")
|
||||
}
|
||||
}
|
||||
@@ -12,7 +12,7 @@ use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
@@ -41,7 +41,7 @@ pub use inmemory_layer::InMemoryLayer;
|
||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
pub use remote_layer::RemoteLayer;
|
||||
|
||||
use super::layer_map::BatchedUpdates;
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
@@ -54,13 +54,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn range_eq<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialEq<T>,
|
||||
{
|
||||
a.start == b.start && a.end == b.end
|
||||
}
|
||||
|
||||
/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
|
||||
///
|
||||
/// Before first call, you can fill in 'page_img' if you have an older cached
|
||||
@@ -169,6 +162,9 @@ impl LayerAccessStats {
|
||||
/// The caller is responsible for recording a residence event
|
||||
/// using [`record_residence_event`] before calling `latest_activity`.
|
||||
/// If they don't, [`latest_activity`] will return `None`.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
/// [`latest_activity`]: Self::latest_activity
|
||||
pub(crate) fn empty_will_record_residence_event_later() -> Self {
|
||||
LayerAccessStats(Mutex::default())
|
||||
}
|
||||
@@ -176,13 +172,13 @@ impl LayerAccessStats {
|
||||
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
pub(crate) fn for_loading_layer<L>(
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
|
||||
///
|
||||
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn for_loading_layer(
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
status: LayerResidenceStatus,
|
||||
) -> Self
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
) -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(
|
||||
layer_map_lock_held_witness,
|
||||
@@ -197,14 +193,13 @@ impl LayerAccessStats {
|
||||
/// The `new_status` is not recorded in `self`.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
pub(crate) fn clone_for_residence_change<L>(
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn clone_for_residence_change(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
new_status: LayerResidenceStatus,
|
||||
) -> LayerAccessStats
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
) -> LayerAccessStats {
|
||||
let clone = {
|
||||
let inner = self.0.lock().unwrap();
|
||||
inner.clone()
|
||||
@@ -232,14 +227,12 @@ impl LayerAccessStats {
|
||||
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
|
||||
/// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
|
||||
///
|
||||
pub(crate) fn record_residence_event<L>(
|
||||
pub(crate) fn record_residence_event(
|
||||
&self,
|
||||
_layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
|
||||
_layer_map_lock_held_witness: &LayerManager,
|
||||
status: LayerResidenceStatus,
|
||||
reason: LayerResidenceEventReason,
|
||||
) where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
) {
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner
|
||||
@@ -309,11 +302,13 @@ impl LayerAccessStats {
|
||||
/// implementation error. This function logs a rate-limited warning in that case.
|
||||
///
|
||||
/// TODO: use type system to avoid the need for `fallback`.
|
||||
/// The approach in https://github.com/neondatabase/neon/pull/3775
|
||||
/// The approach in <https://github.com/neondatabase/neon/pull/3775>
|
||||
/// could be used to enforce that a residence event is recorded
|
||||
/// before a layer is added to the layer map. We could also have
|
||||
/// a layer wrapper type that holds the LayerAccessStats, and ensure
|
||||
/// that that type can only be produced by inserting into the layer map.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
@@ -338,13 +333,12 @@ impl LayerAccessStats {
|
||||
}
|
||||
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
/// required by [`LayerMap`].
|
||||
/// required by [`LayerMap`](super::layer_map::LayerMap).
|
||||
///
|
||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||
/// timeline names, because those are known in the context of which the layers
|
||||
/// are used in (timeline).
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -374,56 +368,30 @@ pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)>;
|
||||
|
||||
/// CANCEL SAFETY: if the returned future is dropped,
|
||||
/// the wrapped closure still run to completion and the return value discarded.
|
||||
/// For the case of get_value_reconstruct_data, we expect the closure to not
|
||||
/// have any side effects, as it only attempts to read a layer (and stuff like
|
||||
/// page cache isn't considered a real side effect).
|
||||
/// But, ...
|
||||
/// TRACING:
|
||||
/// If the returned future is cancelled, the spawn_blocking span can outlive
|
||||
/// the caller's span.
|
||||
/// So, technically, we should be using `parent: None` and `follows_from: current`
|
||||
/// instead. However, in practice, the advantage of maintaining the span stack
|
||||
/// in logs outweighs the disadvantage of having a dangling span in a case that
|
||||
/// is not expected to happen because in pageserver we generally don't drop pending futures.
|
||||
async fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
let span = tracing::info_span!("get_value_reconstruct_data_spawn_blocking");
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = span.enter();
|
||||
self.get_value_reconstruct_data_blocking(key, lsn_range, reconstruct_data, ctx)
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")?
|
||||
}
|
||||
|
||||
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
|
||||
fn short_id(&self) -> String;
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Returned by [`Layer::iter`]
|
||||
/// Returned by [`PersistentLayer::iter`]
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
||||
|
||||
/// Returned by [`Layer::key_iter`]
|
||||
/// Returned by [`PersistentLayer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
pub trait AsLayerDesc {
|
||||
/// Get the layer descriptor.
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc;
|
||||
}
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
@@ -437,10 +405,8 @@ pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send
|
||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN.
|
||||
pub trait PersistentLayer: Layer {
|
||||
/// Get the layer descriptor.
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc;
|
||||
|
||||
pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
/// Identify the tenant this layer belongs to
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.layer_desc().tenant_id
|
||||
}
|
||||
@@ -503,95 +469,40 @@ pub fn downcast_remote_layer(
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds metadata about a layer without any content. Used mostly for testing.
|
||||
///
|
||||
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
|
||||
/// LayerDescriptor.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LayerDescriptor {
|
||||
pub key: Range<Key>,
|
||||
pub lsn: Range<Lsn>,
|
||||
pub is_incremental: bool,
|
||||
pub short_id: String,
|
||||
}
|
||||
pub mod tests {
|
||||
use super::*;
|
||||
|
||||
impl LayerDescriptor {
|
||||
/// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
|
||||
/// and the tenant / timeline id does not matter.
|
||||
pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
|
||||
PersistentLayerDesc::new_delta(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
self.key.clone(),
|
||||
self.lsn.clone(),
|
||||
233,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for LayerDescriptor {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key.clone()
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn.clone()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.is_incremental
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
todo!("This method shouldn't be part of the Layer trait")
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
self.short_id.clone()
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DeltaFileName> for LayerDescriptor {
|
||||
fn from(value: DeltaFileName) -> Self {
|
||||
let short_id = value.to_string();
|
||||
LayerDescriptor {
|
||||
key: value.key_range,
|
||||
lsn: value.lsn_range,
|
||||
is_incremental: true,
|
||||
short_id,
|
||||
impl From<DeltaFileName> for PersistentLayerDesc {
|
||||
fn from(value: DeltaFileName) -> Self {
|
||||
PersistentLayerDesc::new_delta(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn_range,
|
||||
233,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImageFileName> for LayerDescriptor {
|
||||
fn from(value: ImageFileName) -> Self {
|
||||
let short_id = value.to_string();
|
||||
let lsn = value.lsn_as_range();
|
||||
LayerDescriptor {
|
||||
key: value.key_range,
|
||||
lsn,
|
||||
is_incremental: false,
|
||||
short_id,
|
||||
impl From<ImageFileName> for PersistentLayerDesc {
|
||||
fn from(value: ImageFileName) -> Self {
|
||||
PersistentLayerDesc::new_img(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn,
|
||||
false,
|
||||
233,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LayerFileName> for LayerDescriptor {
|
||||
fn from(value: LayerFileName) -> Self {
|
||||
match value {
|
||||
LayerFileName::Delta(d) => Self::from(d),
|
||||
LayerFileName::Image(i) => Self::from(i),
|
||||
impl From<LayerFileName> for PersistentLayerDesc {
|
||||
fn from(value: LayerFileName) -> Self {
|
||||
match value {
|
||||
LayerFileName::Delta(d) => Self::from(d),
|
||||
LayerFileName::Image(i) => Self::from(i),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,14 +7,18 @@
|
||||
//! must be page images or WAL records with the 'will_init' flag set, so that
|
||||
//! they can be replayed without referring to an older page version.
|
||||
//!
|
||||
//! The delta files are stored in timelines/<timeline_id> directory. Currently,
|
||||
//! The delta files are stored in `timelines/<timeline_id>` directory. Currently,
|
||||
//! there are no subdirectories, and each delta file is named like this:
|
||||
//!
|
||||
//! <key start>-<key end>__<start LSN>-<end LSN
|
||||
//! ```text
|
||||
//! <key start>-<key end>__<start LSN>-<end LSN>
|
||||
//! ```
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! ```text
|
||||
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
//! ```
|
||||
//!
|
||||
//! Every delta file consists of three parts: "summary", "index", and
|
||||
//! "values". The summary is a fixed size header at the beginning of the file,
|
||||
@@ -56,8 +60,8 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
||||
PathOrConf, PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -218,18 +222,18 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.desc.lsn_range.start,
|
||||
self.desc.lsn_range.end
|
||||
self.desc.lsn_range.end,
|
||||
self.desc.file_size,
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
@@ -295,13 +299,13 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.desc.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
@@ -309,7 +313,7 @@ impl Layer for DeltaLayer {
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = &inner.file;
|
||||
@@ -375,9 +379,9 @@ impl Layer for DeltaLayer {
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Continue))
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -395,18 +399,21 @@ impl Layer for DeltaLayer {
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn short_id(&self) -> String {
|
||||
self.layer_desc().short_id()
|
||||
impl AsLayerDesc for DeltaLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
@@ -458,22 +465,22 @@ impl PersistentLayer for DeltaLayer {
|
||||
impl DeltaLayer {
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
fname: &DeltaFileName,
|
||||
) -> PathBuf {
|
||||
match path_or_conf {
|
||||
PathOrConf::Path(path) => path.clone(),
|
||||
PathOrConf::Conf(conf) => conf
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.timeline_path(tenant_id, timeline_id)
|
||||
.join(fname.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
conf: &PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
key_start: Key,
|
||||
lsn_range: &Range<Lsn>,
|
||||
) -> PathBuf {
|
||||
@@ -483,7 +490,7 @@ impl DeltaLayer {
|
||||
.map(char::from)
|
||||
.collect();
|
||||
|
||||
conf.timeline_path(&timeline_id, &tenant_id).join(format!(
|
||||
conf.timeline_path(tenant_id, timeline_id).join(format!(
|
||||
"{}-XXX__{:016X}-{:016X}.{}.{}",
|
||||
key_start,
|
||||
u64::from(lsn_range.start),
|
||||
@@ -605,8 +612,8 @@ impl DeltaLayer {
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.desc.timeline_id,
|
||||
self.desc.tenant_id,
|
||||
&self.desc.tenant_id,
|
||||
&self.desc.timeline_id,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
@@ -654,7 +661,7 @@ impl DeltaLayerWriterInner {
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::temp_path_for(conf, timeline_id, tenant_id, key_start, &lsn_range);
|
||||
let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
|
||||
|
||||
let mut file = VirtualFile::create(&path)?;
|
||||
// make room for the header block
|
||||
@@ -769,8 +776,8 @@ impl DeltaLayerWriterInner {
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&DeltaFileName {
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range,
|
||||
@@ -797,7 +804,7 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
|
||||
|
||||
@@ -57,8 +57,9 @@ impl Ord for DeltaFileName {
|
||||
|
||||
/// Represents the filename of a DeltaLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN start>-<LSN end>
|
||||
///
|
||||
/// ```
|
||||
impl DeltaFileName {
|
||||
///
|
||||
/// Parse a string as a delta file name. Returns None if the filename does not
|
||||
@@ -162,7 +163,9 @@ impl ImageFileName {
|
||||
///
|
||||
/// Represents the filename of an ImageLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN>
|
||||
/// ```
|
||||
impl ImageFileName {
|
||||
///
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
@@ -210,9 +213,15 @@ pub enum LayerFileName {
|
||||
|
||||
impl LayerFileName {
|
||||
pub fn file_name(&self) -> String {
|
||||
self.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for LayerFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Self::Image(fname) => fname.to_string(),
|
||||
Self::Delta(fname) => fname.to_string(),
|
||||
Self::Image(fname) => write!(f, "{fname}"),
|
||||
Self::Delta(fname) => write!(f, "{fname}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,11 +7,15 @@
|
||||
//! timelines/<timeline_id> directory. Currently, there are no
|
||||
//! subdirectories, and each image layer file is named like this:
|
||||
//!
|
||||
//! ```text
|
||||
//! <key start>-<key end>__<LSN>
|
||||
//! ```
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! ```text
|
||||
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
//! ```
|
||||
//!
|
||||
//! Every image layer file consists of three parts: "summary",
|
||||
//! "index", and "values". The summary is a fixed size header at the
|
||||
@@ -53,7 +57,9 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::ImageFileName;
|
||||
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};
|
||||
use super::{
|
||||
AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -149,17 +155,18 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} ----",
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.lsn
|
||||
self.lsn,
|
||||
self.desc.is_incremental,
|
||||
self.desc.file_size
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
@@ -182,18 +189,18 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.desc.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
@@ -211,9 +218,9 @@ impl Layer for ImageLayer {
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Missing))
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -231,18 +238,22 @@ impl Layer for ImageLayer {
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn short_id(&self) -> String {
|
||||
self.layer_desc().short_id()
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
@@ -285,7 +296,7 @@ impl ImageLayer {
|
||||
match path_or_conf {
|
||||
PathOrConf::Path(path) => path.to_path_buf(),
|
||||
PathOrConf::Conf(conf) => conf
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.timeline_path(&tenant_id, &timeline_id)
|
||||
.join(fname.to_string()),
|
||||
}
|
||||
}
|
||||
@@ -302,7 +313,7 @@ impl ImageLayer {
|
||||
.map(char::from)
|
||||
.collect();
|
||||
|
||||
conf.timeline_path(&timeline_id, &tenant_id)
|
||||
conf.timeline_path(&tenant_id, &timeline_id)
|
||||
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
||||
}
|
||||
|
||||
@@ -653,7 +664,7 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
|
||||
|
||||
@@ -110,7 +110,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
@@ -132,13 +131,6 @@ impl Layer for InMemoryLayer {
|
||||
true
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
@@ -191,13 +183,13 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
@@ -214,7 +206,7 @@ impl Layer for InMemoryLayer {
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok((reconstruct_state, ValueReconstructResult::Complete));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
@@ -234,13 +226,22 @@ impl Layer for InMemoryLayer {
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Continue))
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
///
|
||||
/// Get layer size on the disk
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use anyhow::Result;
|
||||
use core::fmt::Display;
|
||||
use std::ops::Range;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -48,8 +49,8 @@ impl PersistentLayerDesc {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
pub fn short_id(&self) -> impl Display {
|
||||
self.filename()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -173,13 +174,16 @@ impl PersistentLayerDesc {
|
||||
|
||||
pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
self.lsn_range.end,
|
||||
self.is_delta,
|
||||
self.is_incremental,
|
||||
self.file_size,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::layer_map::BatchedUpdates;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::layer_manager::LayerManager;
|
||||
use anyhow::{bail, Result};
|
||||
use pageserver_api::models::HistoricLayerInfo;
|
||||
use std::ops::Range;
|
||||
@@ -20,12 +20,12 @@ use utils::{
|
||||
|
||||
use super::filename::{DeltaFileName, ImageFileName};
|
||||
use super::{
|
||||
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, ValueReconstructResult,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
/// [`crate::storage_layer::DeltaLayer`].
|
||||
/// [`DeltaLayer`](super::DeltaLayer).
|
||||
///
|
||||
/// RemoteLayer might be downloaded on-demand during operations which are
|
||||
/// allowed download remote layers and during which, it gets replaced with a
|
||||
@@ -50,6 +50,8 @@ pub struct RemoteLayer {
|
||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
||||
/// `Timeline::download_remote_layer`, which also logs.
|
||||
///
|
||||
/// [`ongoing_download`]: Self::ongoing_download
|
||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
@@ -63,31 +65,30 @@ impl std::fmt::Debug for RemoteLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for RemoteLayer {
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_state: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
bail!(
|
||||
"layer {} needs to be downloaded",
|
||||
self.filename().file_name()
|
||||
);
|
||||
_reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
bail!("layer {self} needs to be downloaded");
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.desc.lsn_range.start,
|
||||
self.desc.lsn_range.end
|
||||
self.desc.lsn_range.end,
|
||||
self.desc.is_delta,
|
||||
self.desc.is_incremental,
|
||||
self.desc.file_size,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -107,18 +108,22 @@ impl Layer for RemoteLayer {
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn short_id(&self) -> String {
|
||||
self.layer_desc().short_id()
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for RemoteLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for RemoteLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for RemoteLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
@@ -219,15 +224,12 @@ impl RemoteLayer {
|
||||
}
|
||||
|
||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||
pub fn create_downloaded_layer<L>(
|
||||
pub fn create_downloaded_layer(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
conf: &'static PageServerConf,
|
||||
file_size: u64,
|
||||
) -> Arc<dyn PersistentLayer>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
) -> Arc<dyn PersistentLayer> {
|
||||
if self.desc.is_delta {
|
||||
let fname = self.desc.delta_file_name();
|
||||
Arc::new(DeltaLayer::new(
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user