mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-17 18:32:56 +00:00
Compare commits
84 Commits
hyper-1.0
...
sk-basic-b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cfd78950e2 | ||
|
|
734755eaca | ||
|
|
e34166a28f | ||
|
|
3a36a0a227 | ||
|
|
58f6cb649e | ||
|
|
dcc7610ad6 | ||
|
|
4c245b0f5a | ||
|
|
55b7cde665 | ||
|
|
5b34d5f561 | ||
|
|
26c55b0255 | ||
|
|
12e9b2a909 | ||
|
|
918b03b3b0 | ||
|
|
d36623ad74 | ||
|
|
689ad72e92 | ||
|
|
fd4cce9417 | ||
|
|
d52b81340f | ||
|
|
8dee9908f8 | ||
|
|
19ed230708 | ||
|
|
463b6a26b5 | ||
|
|
c9b1657e4c | ||
|
|
b92be77e19 | ||
|
|
8cb8c8d7b5 | ||
|
|
210700d0d9 | ||
|
|
a0a3ba85e7 | ||
|
|
d820aa1d08 | ||
|
|
996abc9563 | ||
|
|
a72af29d12 | ||
|
|
4f51824820 | ||
|
|
743f6dfb9b | ||
|
|
faf275d4a2 | ||
|
|
001f0d6db7 | ||
|
|
42c17a6fc6 | ||
|
|
37638fce79 | ||
|
|
50288c16b1 | ||
|
|
e03f8abba9 | ||
|
|
1905f0bced | ||
|
|
72de1cb511 | ||
|
|
00d9bf5b61 | ||
|
|
71f495c7f7 | ||
|
|
0a7e050144 | ||
|
|
55bfa91bd7 | ||
|
|
d90b2b99df | ||
|
|
27587e155d | ||
|
|
55aede2762 | ||
|
|
9f186b4d3e | ||
|
|
585687d563 | ||
|
|
65a98e425d | ||
|
|
b2e7249979 | ||
|
|
844303255a | ||
|
|
6d8df2579b | ||
|
|
3c3b53f8ad | ||
|
|
30064eb197 | ||
|
|
869acfe29b | ||
|
|
11a91eaf7b | ||
|
|
394ef013d0 | ||
|
|
a718287902 | ||
|
|
2eac1adcb9 | ||
|
|
3f90b2d337 | ||
|
|
a40ed86d87 | ||
|
|
1bf8bb88c5 | ||
|
|
f1901833a6 | ||
|
|
b41ee81308 | ||
|
|
205b6111e6 | ||
|
|
93572a3e99 | ||
|
|
15c0df4de7 | ||
|
|
3290fb09bf | ||
|
|
efdb2bf948 | ||
|
|
5559b16953 | ||
|
|
1aea65eb9d | ||
|
|
34ddec67d9 | ||
|
|
9ace36d93c | ||
|
|
e4898a6e60 | ||
|
|
c77981289c | ||
|
|
f003dd6ad5 | ||
|
|
7e7e9f5191 | ||
|
|
760a48207d | ||
|
|
88df057531 | ||
|
|
c65ac37a6d | ||
|
|
a092127b17 | ||
|
|
e8f773387d | ||
|
|
00936d19e1 | ||
|
|
57155ada77 | ||
|
|
02b916d3c9 | ||
|
|
e6e013b3b7 |
@@ -1,2 +1,2 @@
|
||||
[profile.default]
|
||||
slow-timeout = "1m"
|
||||
slow-timeout = { period = "20s", terminate-after = 3 }
|
||||
|
||||
@@ -69,7 +69,15 @@ jobs:
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
|
||||
run: |
|
||||
/kaniko/executor \
|
||||
--reproducible \
|
||||
--snapshotMode=redo \
|
||||
--skip-unused-stages \
|
||||
--dockerfile ${{ inputs.dockerfile-path }} \
|
||||
--cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
|
||||
|
||||
kaniko-arm:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
@@ -85,7 +93,15 @@ jobs:
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
run: |
|
||||
/kaniko/executor \
|
||||
--reproducible \
|
||||
--snapshotMode=redo \
|
||||
--skip-unused-stages \
|
||||
--dockerfile ${{ inputs.dockerfile-path }} \
|
||||
--cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
manifest:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
@@ -99,7 +115,10 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Create manifest
|
||||
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
run: |
|
||||
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
|
||||
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
|
||||
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
- name: Push manifest
|
||||
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
|
||||
|
||||
44
.github/workflows/build_and_test.yml
vendored
44
.github/workflows/build_and_test.yml
vendored
@@ -21,6 +21,8 @@ env:
|
||||
COPT: '-Werror'
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
@@ -44,6 +46,20 @@ jobs:
|
||||
|
||||
exit 1
|
||||
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Cancel previous e2e-tests runs for this PR
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
gh workflow --repo neondatabase/cloud \
|
||||
run cancel-previous-in-concurrency-group.yml \
|
||||
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
|
||||
|
||||
tag:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
@@ -186,7 +202,11 @@ jobs:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
options: --init
|
||||
# Raise locked memory limit for tokio-epoll-uring.
|
||||
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
|
||||
# io_uring will account the memory of the CQ and SQ as locked.
|
||||
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -340,8 +360,12 @@ jobs:
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run rust tests
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
@@ -419,8 +443,8 @@ jobs:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -448,6 +472,7 @@ jobs:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
@@ -458,12 +483,13 @@ jobs:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# the amount of groups (N) should be reflected in `extra_params: --splits N ...`
|
||||
pytest_split_group: [ 1, 2, 3, 4 ]
|
||||
build_type: [ release ]
|
||||
steps:
|
||||
@@ -477,11 +503,12 @@ jobs:
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
|
||||
extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
@@ -695,7 +722,8 @@ jobs:
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
|
||||
14
.github/workflows/neon_extra_builds.yml
vendored
14
.github/workflows/neon_extra_builds.yml
vendored
@@ -124,12 +124,12 @@ jobs:
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
CARGO_FEATURES: --features testing
|
||||
CARGO_FLAGS: --locked --release
|
||||
CARGO_FLAGS: --release
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -210,18 +210,20 @@ jobs:
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run cargo test
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
cargo nextest run $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
cargo nextest run --package remote_storage --test test_real_s3
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
@@ -231,7 +233,7 @@ jobs:
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
cargo nextest run --package remote_storage --test test_real_azure
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
timeout-minutes: 90
|
||||
|
||||
122
.github/workflows/update_build_tools_image.yml
vendored
122
.github/workflows/update_build_tools_image.yml
vendored
@@ -20,111 +20,51 @@ defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
outputs:
|
||||
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
|
||||
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Get source image digest
|
||||
id: next-digest
|
||||
run: |
|
||||
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
|
||||
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Get destination image digest (if already exists)
|
||||
id: prev-digest
|
||||
run: |
|
||||
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
|
||||
else
|
||||
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
|
||||
|
||||
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Tag image
|
||||
run: |
|
||||
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
|
||||
|
||||
rollback-tag-image:
|
||||
needs: tag-image
|
||||
if: ${{ !success() }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- name: Configure ECR login
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21'
|
||||
|
||||
- name: Install crane
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
|
||||
|
||||
- name: Restore previous tag if needed
|
||||
- name: Copy images
|
||||
run: |
|
||||
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
|
||||
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
|
||||
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
# I guess we should delete the tag here/untag the image, but crane does not support it
|
||||
# - https://github.com/google/go-containerregistry/issues/999
|
||||
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
|
||||
|
||||
exit 0
|
||||
fi
|
||||
|
||||
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
|
||||
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
|
||||
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
|
||||
|
||||
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
|
||||
else
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
|
||||
fi
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
800
Cargo.lock
generated
800
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
35
Cargo.toml
35
Cargo.toml
@@ -51,7 +51,7 @@ async-trait = "0.1"
|
||||
aws-config = { version = "1.0", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "1.0"
|
||||
aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
|
||||
aws-smithy-types = { version = "1.1.2", features = ["http-body-1-x"] }
|
||||
aws-smithy-types = "1.0"
|
||||
aws-credential-types = "1.0"
|
||||
axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
@@ -89,12 +89,8 @@ hostname = "0.3.1"
|
||||
http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "1.0.0"
|
||||
hyper-util = "0.1.0"
|
||||
http = "1"
|
||||
http-body = "1"
|
||||
http-body-util = "0.1"
|
||||
hyper-tungstenite = "0.13.0"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.11"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
@@ -103,29 +99,29 @@ libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
memoffset = "0.8"
|
||||
native-tls = "0.2"
|
||||
nix = "0.26"
|
||||
notify = "5.0.0"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.19.0"
|
||||
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.11.0"
|
||||
opentelemetry = "0.20.0"
|
||||
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.12.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "49.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.12"
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
|
||||
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
reqwest-retry = "0.2.2"
|
||||
routerify = { git = "https://github.com/conradludgate/routerify", branch = "hyper1" }
|
||||
routerify = "3"
|
||||
rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
rustls = "0.21"
|
||||
@@ -153,8 +149,9 @@ tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tls-listener = { version = "0.9", features = ["rustls", "tokio-net"] }
|
||||
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.10.0"
|
||||
tokio-rustls = "0.24"
|
||||
@@ -163,10 +160,10 @@ tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
toml = "0.7"
|
||||
toml_edit = "0.19"
|
||||
tonic = {version = "0.10", features = ["tls", "tls-roots"]}
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.19.0"
|
||||
tracing-opentelemetry = "0.20.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
@@ -215,7 +212,7 @@ criterion = "0.5.1"
|
||||
rcgen = "0.11"
|
||||
rstest = "0.18"
|
||||
camino-tempfile = "1.0.2"
|
||||
tonic-build = "0.10.2"
|
||||
tonic-build = "0.9"
|
||||
|
||||
[patch.crates-io]
|
||||
|
||||
|
||||
@@ -52,7 +52,7 @@ RUN cd postgres && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
@@ -63,14 +63,14 @@ RUN cd postgres && \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
done
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -143,29 +143,24 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
|
||||
#########################################################################################
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export PLV8_VERSION=3.1.5 \
|
||||
export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
|
||||
;; \
|
||||
"v16") \
|
||||
export PLV8_VERSION=3.1.8 \
|
||||
export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
|
||||
;; \
|
||||
*) \
|
||||
echo "Export the valid PG_VERSION variable" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
|
||||
echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
|
||||
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
# don't break computes with installed old version of plv8
|
||||
cd /usr/local/pgsql/lib/ && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
||||
@@ -551,6 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
|
||||
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
|
||||
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
|
||||
-D RDK_INSTALL_INTREE=OFF \
|
||||
-D RDK_INSTALL_COMIC_FONTS=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -617,6 +613,7 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
@@ -779,6 +776,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# Public extensions
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=postgis-build /sfcgal/* /
|
||||
@@ -883,8 +882,10 @@ FROM debian:bullseye-slim
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
mkdir /var/db/postgres/pgbouncer && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
@@ -12,11 +12,7 @@ cfg-if.workspace = true
|
||||
clap.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
hyper = { workspace = true, features = ["server"] }
|
||||
hyper-util = { workspace = true, features = ["tokio", "server", "server-auto"] }
|
||||
http = { workspace = true, features = [] }
|
||||
http-body = { workspace = true, features = [] }
|
||||
http-body-util = { workspace = true, features = [] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
num_cpus.workspace = true
|
||||
|
||||
@@ -32,8 +32,6 @@
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
|
||||
//! --pgbouncer-ini-path /etc/pgbouncer.ini \
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -112,9 +110,6 @@ fn main() -> Result<()> {
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
|
||||
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
|
||||
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
|
||||
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
@@ -225,8 +220,6 @@ fn main() -> Result<()> {
|
||||
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
build_tag,
|
||||
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
|
||||
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -523,23 +516,6 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-connstr")
|
||||
.long("pgbouncer-connstr")
|
||||
.default_value(
|
||||
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
|
||||
)
|
||||
.value_name("PGBOUNCER_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-ini-path")
|
||||
.long("pgbouncer-ini-path")
|
||||
// Note: this doesn't match current path for pgbouncer.ini.
|
||||
// Until we fix it, we need to pass the path explicitly
|
||||
// or this will be effectively no-op.
|
||||
.default_value("/etc/pgbouncer.ini")
|
||||
.value_name("PGBOUNCER_INI_PATH"),
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
|
||||
@@ -71,10 +71,6 @@ pub struct ComputeNode {
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
// connection string to pgbouncer to change settings
|
||||
pub pgbouncer_connstr: Option<String>,
|
||||
// path to pgbouncer.ini to change settings
|
||||
pub pgbouncer_ini_path: Option<String>,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
@@ -704,13 +700,14 @@ impl ComputeNode {
|
||||
// In this case we need to connect with old `zenith_admin` name
|
||||
// and create new user. We cannot simply rename connected user,
|
||||
// but we can create a new one and grant it all privileges.
|
||||
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
|
||||
let connstr = self.connstr.clone();
|
||||
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
||||
Err(e) => {
|
||||
info!(
|
||||
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
||||
e
|
||||
);
|
||||
let mut zenith_admin_connstr = self.connstr.clone();
|
||||
let mut zenith_admin_connstr = connstr.clone();
|
||||
|
||||
zenith_admin_connstr
|
||||
.set_username("zenith_admin")
|
||||
@@ -723,8 +720,8 @@ impl ComputeNode {
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
drop(client);
|
||||
|
||||
// reconnect with connsting with expected name
|
||||
Client::connect(self.connstr.as_str(), NoTls)?
|
||||
// reconnect with connstring with expected name
|
||||
Client::connect(connstr.as_str(), NoTls)?
|
||||
}
|
||||
Ok(client) => client,
|
||||
};
|
||||
@@ -738,8 +735,8 @@ impl ComputeNode {
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, &mut client, self.connstr.as_str())?;
|
||||
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, &mut client, connstr.as_str())?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
create_availability_check_data(&mut client)?;
|
||||
@@ -747,6 +744,12 @@ impl ComputeNode {
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
if self.has_feature(ComputeFeature::Migrations) {
|
||||
thread::spawn(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
handle_migrations(&mut client)
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -769,8 +772,8 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
@@ -779,15 +782,9 @@ impl ComputeNode {
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
@@ -817,6 +814,10 @@ impl ComputeNode {
|
||||
handle_grants(&spec, &mut client, self.connstr.as_str())?;
|
||||
handle_extensions(&spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
// We can skip handle_migrations here because a new migration can only appear
|
||||
// if we have a new version of the compute_ctl binary, which can only happen
|
||||
// if compute got restarted, in which case we'll end up inside of apply_config
|
||||
// instead of reconfigure.
|
||||
}
|
||||
|
||||
// 'Close' connection
|
||||
@@ -852,8 +853,8 @@ impl ComputeNode {
|
||||
);
|
||||
|
||||
// tune pgbouncer
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
@@ -862,15 +863,9 @@ impl ComputeNode {
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
|
||||
@@ -6,22 +6,14 @@ use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use bytes::Bytes;
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||
|
||||
use anyhow::Result;
|
||||
use http_body_util::BodyExt;
|
||||
use http_body_util::Full;
|
||||
use hyper::body::Incoming;
|
||||
use hyper::service::service_fn;
|
||||
use hyper::{Method, Request, Response, StatusCode};
|
||||
use hyper_util::rt::TokioExecutor;
|
||||
use hyper_util::rt::TokioIo;
|
||||
use hyper_util::server::conn;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use num_cpus;
|
||||
use serde_json;
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::task;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
@@ -44,7 +36,7 @@ fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||
}
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<Full<Bytes>> {
|
||||
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
|
||||
//
|
||||
// NOTE: The URI path is currently included in traces. That's OK because
|
||||
// it doesn't contain any variable parts or sensitive information. But
|
||||
@@ -56,7 +48,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
info!("serving /status GET request");
|
||||
let state = compute.state.lock().unwrap();
|
||||
let status_response = status_response_from_state(&state);
|
||||
Response::new(Full::from(serde_json::to_string(&status_response).unwrap()))
|
||||
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
|
||||
}
|
||||
|
||||
// Startup metrics in JSON format. Keep /metrics reserved for a possible
|
||||
@@ -64,7 +56,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
(&Method::GET, "/metrics.json") => {
|
||||
info!("serving /metrics.json GET request");
|
||||
let metrics = compute.state.lock().unwrap().metrics.clone();
|
||||
Response::new(Full::from(serde_json::to_string(&metrics).unwrap()))
|
||||
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
|
||||
}
|
||||
|
||||
// Collect Postgres current usage insights
|
||||
@@ -74,11 +66,11 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!("compute is not running, current status: {:?}", status);
|
||||
error!(msg);
|
||||
return Response::new(Full::from(msg));
|
||||
return Response::new(Body::from(msg));
|
||||
}
|
||||
|
||||
let insights = compute.collect_insights().await;
|
||||
Response::new(Full::from(insights))
|
||||
Response::new(Body::from(insights))
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
@@ -90,15 +82,15 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
status
|
||||
);
|
||||
error!(msg);
|
||||
return Response::new(Full::from(msg));
|
||||
return Response::new(Body::from(msg));
|
||||
}
|
||||
|
||||
let res = crate::checker::check_writability(compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Full::from("true")),
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => {
|
||||
error!("check_writability failed: {}", e);
|
||||
Response::new(Full::from(e.to_string()))
|
||||
Response::new(Body::from(e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -106,7 +98,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
(&Method::GET, "/info") => {
|
||||
let num_cpus = num_cpus::get_physical();
|
||||
info!("serving /info GET request. num_cpus: {}", num_cpus);
|
||||
Response::new(Full::from(
|
||||
Response::new(Body::from(
|
||||
serde_json::json!({
|
||||
"num_cpus": num_cpus,
|
||||
})
|
||||
@@ -123,7 +115,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
(&Method::POST, "/configure") => {
|
||||
info!("serving /configure POST request");
|
||||
match handle_configure_request(req, compute).await {
|
||||
Ok(msg) => Response::new(Full::from(msg)),
|
||||
Ok(msg) => Response::new(Body::from(msg)),
|
||||
Err((msg, code)) => {
|
||||
error!("error handling /configure request: {msg}");
|
||||
render_json_error(&msg, code)
|
||||
@@ -140,7 +132,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
// if no remote storage is configured
|
||||
if compute.ext_remote_storage.is_none() {
|
||||
info!("no extensions remote storage configured");
|
||||
let mut resp = Response::new(Full::from("no remote storage configured"));
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
@@ -151,7 +143,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
if params == "is_library=true" {
|
||||
is_library = true;
|
||||
} else {
|
||||
let mut resp = Response::new(Full::from("Wrong request parameters"));
|
||||
let mut resp = Response::new(Body::from("Wrong request parameters"));
|
||||
*resp.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return resp;
|
||||
}
|
||||
@@ -173,7 +165,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
Some(r) => r,
|
||||
None => {
|
||||
info!("no remote extensions spec was provided");
|
||||
let mut resp = Response::new(Full::from("no remote storage configured"));
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
@@ -190,10 +182,10 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
match ext {
|
||||
Ok((ext_name, ext_path)) => {
|
||||
match compute.download_extension(ext_name, ext_path).await {
|
||||
Ok(_) => Response::new(Full::from("OK")),
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
let mut resp = Response::new(Full::from(e.to_string()));
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
@@ -201,7 +193,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("extension download failed to find extension: {}", e);
|
||||
let mut resp = Response::new(Full::from("failed to find file"));
|
||||
let mut resp = Response::new(Body::from("failed to find file"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
@@ -210,7 +202,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Full::from("404 Not Found"));
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||
not_found
|
||||
}
|
||||
@@ -218,7 +210,7 @@ async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<
|
||||
}
|
||||
|
||||
async fn handle_configure_request(
|
||||
req: Request<Incoming>,
|
||||
req: Request<Body>,
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<String, (String, StatusCode)> {
|
||||
if !compute.live_config_allowed {
|
||||
@@ -228,7 +220,7 @@ async fn handle_configure_request(
|
||||
));
|
||||
}
|
||||
|
||||
let body_bytes = req.into_body().collect().await.unwrap().to_bytes();
|
||||
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
|
||||
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
|
||||
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
|
||||
let spec = request.spec;
|
||||
@@ -295,13 +287,13 @@ async fn handle_configure_request(
|
||||
}
|
||||
}
|
||||
|
||||
fn render_json_error(e: &str, status: StatusCode) -> Response<Full<Bytes>> {
|
||||
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
let error = GenericAPIError {
|
||||
error: e.to_string(),
|
||||
};
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.body(Full::from(serde_json::to_string(&error).unwrap()))
|
||||
.body(Body::from(serde_json::to_string(&error).unwrap()))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
@@ -312,43 +304,35 @@ async fn serve(port: u16, state: Arc<ComputeNode>) {
|
||||
// see e.g. https://github.com/rust-lang/rust/pull/34440
|
||||
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
|
||||
|
||||
let service = service_fn(move |req: Request<Incoming>| {
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(
|
||||
// NOTE: We include the URI path in the string. It
|
||||
// doesn't contain any variable parts or sensitive
|
||||
// information in this API.
|
||||
tracing_utils::http::tracing_handler(
|
||||
req,
|
||||
|req| routes(req, &state),
|
||||
OtelName::UriPath,
|
||||
)
|
||||
.await,
|
||||
)
|
||||
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
||||
let state = state.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(
|
||||
// NOTE: We include the URI path in the string. It
|
||||
// doesn't contain any variable parts or sensitive
|
||||
// information in this API.
|
||||
tracing_utils::http::tracing_handler(
|
||||
req,
|
||||
|req| routes(req, &state),
|
||||
OtelName::UriPath,
|
||||
)
|
||||
.await,
|
||||
)
|
||||
}
|
||||
}))
|
||||
}
|
||||
});
|
||||
|
||||
info!("starting HTTP server on {}", addr);
|
||||
|
||||
let listener = TcpListener::bind(addr).await.unwrap();
|
||||
loop {
|
||||
let (stream, _) = match listener.accept().await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
error!("server error: {}", e);
|
||||
return;
|
||||
}
|
||||
};
|
||||
let io = TokioIo::new(stream);
|
||||
let service = service.clone();
|
||||
tokio::task::spawn(async move {
|
||||
let builder = conn::auto::Builder::new(TokioExecutor::new());
|
||||
let res = builder.serve_connection(io, service).await;
|
||||
if let Err(err) = res {
|
||||
println!("Error serving connection: {:?}", err);
|
||||
}
|
||||
});
|
||||
let server = Server::bind(&addr).serve(make_service);
|
||||
|
||||
// Run this server forever
|
||||
if let Err(e) = server.await {
|
||||
error!("server error: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -366,7 +366,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
|
||||
}
|
||||
|
||||
/// Update pgbouncer.ini with provided options
|
||||
pub fn update_pgbouncer_ini(
|
||||
fn update_pgbouncer_ini(
|
||||
pgbouncer_config: HashMap<String, String>,
|
||||
pgbouncer_ini_path: &str,
|
||||
) -> Result<()> {
|
||||
@@ -375,6 +375,10 @@ pub fn update_pgbouncer_ini(
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
section.insert(option_name, value);
|
||||
debug!(
|
||||
"Updating pgbouncer.ini with new values {}={}",
|
||||
option_name, value
|
||||
);
|
||||
}
|
||||
|
||||
conf.write_to_file(pgbouncer_ini_path)?;
|
||||
@@ -384,49 +388,80 @@ pub fn update_pgbouncer_ini(
|
||||
/// Tune pgbouncer.
|
||||
/// 1. Apply new config using pgbouncer admin console
|
||||
/// 2. Add new values to pgbouncer.ini to preserve them after restart
|
||||
pub async fn tune_pgbouncer(
|
||||
pgbouncer_settings: Option<HashMap<String, String>>,
|
||||
pgbouncer_connstr: &str,
|
||||
pgbouncer_ini_path: Option<String>,
|
||||
) -> Result<()> {
|
||||
if let Some(pgbouncer_config) = pgbouncer_settings {
|
||||
// Apply new config
|
||||
let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
|
||||
let (client, connection) = connect_result.unwrap();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
|
||||
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
|
||||
// for VMs use pgbouncer specific way to connect to
|
||||
// pgbouncer admin console without password
|
||||
// when pgbouncer is running under the same user.
|
||||
"host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
|
||||
} else {
|
||||
// for k8s use normal connection string with password
|
||||
// to connect to pgbouncer admin console
|
||||
let mut pgbouncer_connstr =
|
||||
"host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
|
||||
if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
|
||||
pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
|
||||
}
|
||||
pgbouncer_connstr
|
||||
};
|
||||
|
||||
info!(
|
||||
"Connecting to pgbouncer with connection string: {}",
|
||||
pgbouncer_connstr
|
||||
);
|
||||
|
||||
// connect to pgbouncer, retrying several times
|
||||
// because pgbouncer may not be ready yet
|
||||
let mut retries = 3;
|
||||
let client = loop {
|
||||
match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
|
||||
Ok((client, connection)) => {
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
break client;
|
||||
}
|
||||
});
|
||||
Err(e) => {
|
||||
if retries == 0 {
|
||||
return Err(e.into());
|
||||
}
|
||||
error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e);
|
||||
retries -= 1;
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
info!(
|
||||
"Applying pgbouncer setting change: {} = {}",
|
||||
option_name, value
|
||||
// Apply new config
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
let query = format!("SET {}={}", option_name, value);
|
||||
// keep this log line for debugging purposes
|
||||
info!("Applying pgbouncer setting change: {}", query);
|
||||
|
||||
if let Err(err) = client.simple_query(&query).await {
|
||||
// Don't fail on error, just print it into log
|
||||
error!(
|
||||
"Failed to apply pgbouncer setting change: {}, {}",
|
||||
query, err
|
||||
);
|
||||
let query = format!("SET {} = {}", option_name, value);
|
||||
|
||||
let result = client.simple_query(&query).await;
|
||||
|
||||
info!("Applying pgbouncer setting change: {}", query);
|
||||
info!("pgbouncer setting change result: {:?}", result);
|
||||
|
||||
if let Err(err) = result {
|
||||
// Don't fail on error, just print it into log
|
||||
error!(
|
||||
"Failed to apply pgbouncer setting change: {}, {}",
|
||||
query, err
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
// save values to pgbouncer.ini
|
||||
// so that they are preserved after pgbouncer restart
|
||||
if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
|
||||
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// save values to pgbouncer.ini
|
||||
// so that they are preserved after pgbouncer restart
|
||||
let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
|
||||
// in VMs we use /etc/pgbouncer.ini
|
||||
"/etc/pgbouncer.ini".to_string()
|
||||
} else {
|
||||
// in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini
|
||||
// this is a shared volume between pgbouncer and postgres containers
|
||||
// FIXME: fix permissions for this file
|
||||
"/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
|
||||
};
|
||||
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -727,3 +727,79 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
info!("handle migrations");
|
||||
|
||||
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
// !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
|
||||
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
let migrations = [
|
||||
"ALTER ROLE neon_superuser BYPASSRLS",
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
|
||||
END LOOP;
|
||||
|
||||
FOR role_name IN SELECT rolname FROM pg_roles
|
||||
WHERE
|
||||
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
|
||||
END LOOP;
|
||||
END $$;
|
||||
"#,
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client.query_one(query, &[])?;
|
||||
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
||||
let starting_migration_id = current_migration;
|
||||
|
||||
query = "BEGIN";
|
||||
client.simple_query(query)?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
info!("Running migration:\n{}\n", migrations[current_migration]);
|
||||
client.simple_query(migrations[current_migration])?;
|
||||
current_migration += 1;
|
||||
}
|
||||
let setval = format!(
|
||||
"UPDATE neon_migration.migration_id SET id={}",
|
||||
migrations.len()
|
||||
);
|
||||
client.simple_query(&setval)?;
|
||||
|
||||
query = "COMMIT";
|
||||
client.simple_query(query)?;
|
||||
|
||||
info!(
|
||||
"Ran {} migrations",
|
||||
(migrations.len() - starting_migration_id)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -10,6 +10,8 @@ async-trait.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
diesel = { version = "2.1.4", features = ["postgres"]}
|
||||
diesel_migrations = { version = "2.1.0", features = ["postgres"]}
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
nix.workspace = true
|
||||
@@ -19,6 +21,7 @@ hex.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
|
||||
@@ -25,6 +25,8 @@ tracing.workspace = true
|
||||
# a parsing function when loading pageservers from neon_local LocalEnv
|
||||
postgres_backend.workspace = true
|
||||
|
||||
diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
|
||||
|
||||
utils = { path = "../../libs/utils/" }
|
||||
metrics = { path = "../../libs/metrics/" }
|
||||
control_plane = { path = ".." }
|
||||
|
||||
0
control_plane/attachment_service/migrations/.keep
Normal file
0
control_plane/attachment_service/migrations/.keep
Normal file
@@ -0,0 +1,6 @@
|
||||
-- This file was automatically created by Diesel to setup helper functions
|
||||
-- and other internal bookkeeping. This file is safe to edit, any future
|
||||
-- changes will be added to existing projects as new migrations.
|
||||
|
||||
DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
|
||||
DROP FUNCTION IF EXISTS diesel_set_updated_at();
|
||||
@@ -0,0 +1,36 @@
|
||||
-- This file was automatically created by Diesel to setup helper functions
|
||||
-- and other internal bookkeeping. This file is safe to edit, any future
|
||||
-- changes will be added to existing projects as new migrations.
|
||||
|
||||
|
||||
|
||||
|
||||
-- Sets up a trigger for the given table to automatically set a column called
|
||||
-- `updated_at` whenever the row is modified (unless `updated_at` was included
|
||||
-- in the modified columns)
|
||||
--
|
||||
-- # Example
|
||||
--
|
||||
-- ```sql
|
||||
-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
|
||||
--
|
||||
-- SELECT diesel_manage_updated_at('users');
|
||||
-- ```
|
||||
CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
|
||||
BEGIN
|
||||
EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
|
||||
FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
|
||||
BEGIN
|
||||
IF (
|
||||
NEW IS DISTINCT FROM OLD AND
|
||||
NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
|
||||
) THEN
|
||||
NEW.updated_at := current_timestamp;
|
||||
END IF;
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
@@ -0,0 +1 @@
|
||||
DROP TABLE tenant_shards;
|
||||
@@ -0,0 +1,12 @@
|
||||
CREATE TABLE tenant_shards (
|
||||
tenant_id VARCHAR NOT NULL,
|
||||
shard_number INTEGER NOT NULL,
|
||||
shard_count INTEGER NOT NULL,
|
||||
PRIMARY KEY(tenant_id, shard_number, shard_count),
|
||||
shard_stripe_size INTEGER NOT NULL,
|
||||
generation INTEGER NOT NULL,
|
||||
generation_pageserver BIGINT NOT NULL,
|
||||
placement_policy VARCHAR NOT NULL,
|
||||
-- config is JSON encoded, opaque to the database.
|
||||
config TEXT NOT NULL
|
||||
);
|
||||
@@ -0,0 +1 @@
|
||||
DROP TABLE nodes;
|
||||
@@ -0,0 +1,10 @@
|
||||
CREATE TABLE nodes (
|
||||
node_id BIGINT PRIMARY KEY NOT NULL,
|
||||
|
||||
scheduling_policy VARCHAR NOT NULL,
|
||||
|
||||
listen_http_addr VARCHAR NOT NULL,
|
||||
listen_http_port INTEGER NOT NULL,
|
||||
listen_pg_addr VARCHAR NOT NULL,
|
||||
listen_pg_port INTEGER NOT NULL
|
||||
);
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::Service;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
|
||||
@@ -104,34 +104,34 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
json_response(StatusCode::OK, state.service.inspect(inspect_req))
|
||||
}
|
||||
|
||||
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.tenant_create(create_req).await?,
|
||||
)
|
||||
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_timeline_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_locate(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
|
||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -154,14 +154,15 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_shard_migrate(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
service
|
||||
.tenant_shard_migrate(tenant_shard_id, migrate_req)
|
||||
.await?,
|
||||
)
|
||||
@@ -178,6 +179,35 @@ impl From<ReconcileError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
|
||||
where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
let state = get_state(&request);
|
||||
let service = state.service.clone();
|
||||
|
||||
let startup_complete = service.startup_complete.clone();
|
||||
if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
// This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate
|
||||
// timeouts around its remote calls, to bound its runtime.
|
||||
return Err(ApiError::Timeout(
|
||||
"Timed out waiting for service readiness".into(),
|
||||
));
|
||||
}
|
||||
|
||||
request_span(
|
||||
request,
|
||||
|request| async move { handler(service, request).await },
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
service: Arc<Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
@@ -205,14 +235,20 @@ pub fn make_router(
|
||||
.put("/node/:node_id/config", |r| {
|
||||
request_span(r, handle_node_configure)
|
||||
})
|
||||
.post("/tenant", |r| request_span(r, handle_tenant_create))
|
||||
.post("/tenant/:tenant_id/timeline", |r| {
|
||||
request_span(r, handle_tenant_timeline_create)
|
||||
.post("/v1/tenant", |r| {
|
||||
tenant_service_handler(r, handle_tenant_create)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_create)
|
||||
})
|
||||
.get("/tenant/:tenant_id/locate", |r| {
|
||||
request_span(r, handle_tenant_locate)
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
.put("/tenant/:tenant_shard_id/migrate", |r| {
|
||||
request_span(r, handle_tenant_shard_migrate)
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
// Path aliases for tests_forward_compatibility
|
||||
// TODO: remove these in future PR
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ mod node;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
mod scheduler;
|
||||
mod schema;
|
||||
pub mod service;
|
||||
mod tenant_state;
|
||||
|
||||
@@ -17,6 +18,8 @@ enum PlacementPolicy {
|
||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||
/// some number of secondaries.
|
||||
Double(usize),
|
||||
/// Do not attach to any pageservers
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
|
||||
|
||||
@@ -12,9 +12,9 @@ use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use std::sync::Arc;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::logging::{self, LogFormat};
|
||||
use utils::signals::{ShutdownSignals, Signal};
|
||||
|
||||
use utils::{project_build_tag, project_git_version, tcp_listener};
|
||||
|
||||
@@ -40,6 +40,10 @@ struct Cli {
|
||||
/// Path to the .json file to store state (will be created if it doesn't exist)
|
||||
#[arg(short, long)]
|
||||
path: Utf8PathBuf,
|
||||
|
||||
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
||||
#[arg(long)]
|
||||
database_url: String,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -66,9 +70,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
jwt_token: args.jwt_token,
|
||||
};
|
||||
|
||||
let persistence = Arc::new(Persistence::new(&args.path).await);
|
||||
let json_path = if args.path.as_os_str().is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(args.path)
|
||||
};
|
||||
let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
|
||||
|
||||
let service = Service::spawn(config, persistence).await?;
|
||||
let service = Service::spawn(config, persistence.clone()).await?;
|
||||
|
||||
let http_listener = tcp_listener::bind(args.listen)?;
|
||||
|
||||
@@ -81,20 +90,31 @@ async fn main() -> anyhow::Result<()> {
|
||||
let router = make_router(service, auth)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
|
||||
let router_service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
|
||||
tokio::task::spawn(server);
|
||||
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
|
||||
tracing::info!("Got {}. Terminating", signal.name());
|
||||
// We're just a test helper: no graceful shutdown.
|
||||
std::process::exit(0);
|
||||
}
|
||||
})?;
|
||||
// Wait until we receive a signal
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
|
||||
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
|
||||
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
|
||||
tokio::select! {
|
||||
_ = sigint.recv() => {},
|
||||
_ = sigterm.recv() => {},
|
||||
_ = sigquit.recv() => {},
|
||||
}
|
||||
tracing::info!("Terminating on signal");
|
||||
|
||||
Ok(())
|
||||
if json_path.is_some() {
|
||||
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
|
||||
// full postgres dumps around.
|
||||
if let Err(e) = persistence.write_tenants_json().await {
|
||||
tracing::error!("Failed to write JSON on shutdown: {e}")
|
||||
}
|
||||
}
|
||||
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::persistence::NodePersistence;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct Node {
|
||||
pub(crate) id: NodeId,
|
||||
@@ -34,4 +36,15 @@ impl Node {
|
||||
NodeSchedulingPolicy::Pause => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn to_persistent(&self) -> NodePersistence {
|
||||
NodePersistence {
|
||||
node_id: self.id.0 as i64,
|
||||
scheduling_policy: self.scheduling.into(),
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port as i32,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
listen_pg_port: self.listen_pg_port as i32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,139 +1,161 @@
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use control_plane::{
|
||||
attachment_service::{NodeAvailability, NodeSchedulingPolicy},
|
||||
local_env::LocalEnv,
|
||||
};
|
||||
use pageserver_api::{
|
||||
models::TenantConfig,
|
||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||
};
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::{node::Node, PlacementPolicy};
|
||||
use crate::node::Node;
|
||||
use crate::PlacementPolicy;
|
||||
|
||||
/// Placeholder for storage. This will be replaced with a database client.
|
||||
/// ## What do we store?
|
||||
///
|
||||
/// The attachment service does not store most of its state durably.
|
||||
///
|
||||
/// The essential things to store durably are:
|
||||
/// - generation numbers, as these must always advance monotonically to ensure data safety.
|
||||
/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
|
||||
/// - Node's scheduling policies, as the source of truth for these is something external.
|
||||
///
|
||||
/// Other things we store durably as an implementation detail:
|
||||
/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat,
|
||||
/// but it is operationally simpler to make this service the authority for which nodes
|
||||
/// it talks to.
|
||||
///
|
||||
/// ## Performance/efficiency
|
||||
///
|
||||
/// The attachment service does not go via the database for most things: there are
|
||||
/// a couple of places where we must, and where efficiency matters:
|
||||
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
|
||||
/// before it can attach a tenant, so this acts as a bound on how fast things like
|
||||
/// failover can happen.
|
||||
/// - Pageserver re-attach: we will increment many shards' generations when this happens,
|
||||
/// so it is important to avoid e.g. issuing O(N) queries.
|
||||
///
|
||||
/// Database calls relating to nodes have low performance requirements, as they are very rarely
|
||||
/// updated, and reads of nodes are always from memory, not the database. We only require that
|
||||
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
|
||||
pub struct Persistence {
|
||||
state: std::sync::Mutex<PersistentState>,
|
||||
database_url: String,
|
||||
|
||||
// In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
|
||||
// test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
|
||||
// compatible just yet.
|
||||
json_path: Option<Utf8PathBuf>,
|
||||
}
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
/// Legacy format, for use in JSON compat objects in test environment
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PersistentState {
|
||||
struct JsonPersistence {
|
||||
tenants: HashMap<TenantShardId, TenantShardPersistence>,
|
||||
|
||||
#[serde(skip)]
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
/// A convenience for serializing the state inside a sync lock, and then
|
||||
/// writing it to disk outside of the lock. This will go away when switching
|
||||
/// to a database backend.
|
||||
struct PendingWrite {
|
||||
bytes: Vec<u8>,
|
||||
path: Utf8PathBuf,
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum DatabaseError {
|
||||
#[error(transparent)]
|
||||
Query(#[from] diesel::result::Error),
|
||||
#[error(transparent)]
|
||||
Connection(#[from] diesel::result::ConnectionError),
|
||||
#[error("Logical error: {0}")]
|
||||
Logical(String),
|
||||
}
|
||||
|
||||
impl PendingWrite {
|
||||
async fn commit(&self) -> anyhow::Result<()> {
|
||||
tokio::fs::write(&self.path, &self.bytes).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentState {
|
||||
fn save(&self) -> PendingWrite {
|
||||
PendingWrite {
|
||||
bytes: serde_json::to_vec(self).expect("Serialization error"),
|
||||
path: self.path.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
|
||||
decoded.path = path.to_owned();
|
||||
|
||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||
// empty strings with proper defaults.
|
||||
if tenant.tenant_id.is_empty() {
|
||||
tenant.tenant_id = format!("{}", tenant_id);
|
||||
tenant.config = serde_json::to_string(&TenantConfig::default())?;
|
||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(decoded)
|
||||
}
|
||||
|
||||
async fn load_or_new(path: &Utf8Path) -> Self {
|
||||
match Self::load(path).await {
|
||||
Ok(s) => {
|
||||
tracing::info!("Loaded state file at {}", path);
|
||||
s
|
||||
}
|
||||
Err(e)
|
||||
if e.downcast_ref::<std::io::Error>()
|
||||
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
|
||||
.unwrap_or(false) =>
|
||||
{
|
||||
tracing::info!("Will create state file at {}", path);
|
||||
Self {
|
||||
tenants: HashMap::new(),
|
||||
path: path.to_owned(),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||
|
||||
impl Persistence {
|
||||
pub async fn new(path: &Utf8Path) -> Self {
|
||||
let state = PersistentState::load_or_new(path).await;
|
||||
pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
|
||||
Self {
|
||||
state: std::sync::Mutex::new(state),
|
||||
database_url,
|
||||
json_path,
|
||||
}
|
||||
}
|
||||
|
||||
/// When registering a node, persist it so that on next start we will be able to
|
||||
/// iterate over known nodes to synchronize their tenant shard states with our observed state.
|
||||
pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
|
||||
// TODO: node persitence will come with database backend
|
||||
Ok(())
|
||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let database_url = self.database_url.clone();
|
||||
tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
|
||||
// TODO: connection pooling, such as via diesel::r2d2
|
||||
let mut conn = PgConnection::establish(&database_url)?;
|
||||
func(&mut conn)
|
||||
})
|
||||
.await
|
||||
.expect("Task panic")
|
||||
}
|
||||
|
||||
/// At startup, we populate the service's list of nodes, and use this list to call into
|
||||
/// each node to do an initial reconciliation of the state of the world with our in-memory
|
||||
/// observed state.
|
||||
pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
|
||||
let env = LocalEnv::load_config()?;
|
||||
// TODO: node persitence will come with database backend
|
||||
/// When a node is first registered, persist it before using it for anything
|
||||
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
||||
let np = node.to_persistent();
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::insert_into(crate::schema::nodes::table)
|
||||
.values(&np)
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
// XXX hack: enable test_backward_compatibility to work by populating our list of
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
|
||||
let nodes: Vec<Node> = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table
|
||||
.load::<NodePersistence>(conn)?
|
||||
.into_iter()
|
||||
.map(|n| Node {
|
||||
id: NodeId(n.node_id as u64),
|
||||
// At startup we consider a node offline until proven otherwise.
|
||||
availability: NodeAvailability::Offline,
|
||||
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
|
||||
.expect("Bad scheduling policy in DB"),
|
||||
listen_http_addr: n.listen_http_addr,
|
||||
listen_http_port: n.listen_http_port as u16,
|
||||
listen_pg_addr: n.listen_pg_addr,
|
||||
listen_pg_port: n.listen_pg_port as u16,
|
||||
})
|
||||
.collect::<Vec<Node>>())
|
||||
})
|
||||
.await?;
|
||||
|
||||
if nodes.is_empty() {
|
||||
return self.list_nodes_local_env().await;
|
||||
}
|
||||
|
||||
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
||||
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
/// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
|
||||
pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
|
||||
// Enable test_backward_compatibility to work by populating our list of
|
||||
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
|
||||
// first startup in the compat test, we may have shards but no nodes.
|
||||
let mut result = Vec::new();
|
||||
use control_plane::local_env::LocalEnv;
|
||||
let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
|
||||
tracing::info!(
|
||||
"Loaded {} pageserver nodes from LocalEnv",
|
||||
"Loading {} pageserver nodes from LocalEnv",
|
||||
env.pageservers.len()
|
||||
);
|
||||
let mut nodes = Vec::new();
|
||||
for ps_conf in env.pageservers {
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
result.push(Node {
|
||||
let node = Node {
|
||||
id: ps_conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
@@ -141,16 +163,96 @@ impl Persistence {
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
});
|
||||
};
|
||||
|
||||
// Synchronize database with what we learn from LocalEnv
|
||||
self.insert_node(&node).await?;
|
||||
|
||||
nodes.push(node);
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
/// At startup, we populate our map of tenant shards from persistent storage.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
|
||||
let locked = self.state.lock().unwrap();
|
||||
Ok(locked.tenants.values().cloned().collect())
|
||||
/// At startup, load the high level state for shards, such as their config + policy. This will
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
let loaded = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
if loaded.is_empty() {
|
||||
if let Some(path) = &self.json_path {
|
||||
if tokio::fs::try_exists(path)
|
||||
.await
|
||||
.map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
|
||||
{
|
||||
tracing::info!("Importing from legacy JSON format at {path}");
|
||||
return self.list_tenant_shards_json(path).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(loaded)
|
||||
}
|
||||
|
||||
/// Shim for automated compatibility tests: load tenants from a JSON file instead of database
|
||||
pub(crate) async fn list_tenant_shards_json(
|
||||
&self,
|
||||
path: &Utf8Path,
|
||||
) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
let bytes = tokio::fs::read(path)
|
||||
.await
|
||||
.map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
|
||||
|
||||
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
||||
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||
// empty strings with proper defaults.
|
||||
if tenant.tenant_id.is_empty() {
|
||||
tenant.tenant_id = tenant_id.to_string();
|
||||
tenant.config = serde_json::to_string(&TenantConfig::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
}
|
||||
}
|
||||
|
||||
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
|
||||
|
||||
// Synchronize database with what is in the JSON file
|
||||
self.insert_tenant_shards(tenants.clone()).await?;
|
||||
|
||||
Ok(tenants)
|
||||
}
|
||||
|
||||
/// For use in testing environments, where we dump out JSON on shutdown.
|
||||
pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
|
||||
let Some(path) = &self.json_path else {
|
||||
anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
|
||||
};
|
||||
tracing::info!("Writing state to {path}...");
|
||||
let tenants = self.list_tenant_shards().await?;
|
||||
let mut tenants_map = HashMap::new();
|
||||
for tsp in tenants {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
|
||||
tenants_map.insert(tenant_shard_id, tsp);
|
||||
}
|
||||
let json = serde_json::to_string(&JsonPersistence {
|
||||
tenants: tenants_map,
|
||||
})?;
|
||||
|
||||
tokio::fs::write(path, &json).await?;
|
||||
tracing::info!("Wrote {} bytes to {path}...", json.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tenants must be persisted before we schedule them for the first time. This enables us
|
||||
@@ -158,24 +260,77 @@ impl Persistence {
|
||||
pub(crate) async fn insert_tenant_shards(
|
||||
&self,
|
||||
shards: Vec<TenantShardPersistence>,
|
||||
) -> anyhow::Result<()> {
|
||||
let write = {
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
for shard in shards {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(shard.shard_number as u8),
|
||||
shard_count: ShardCount(shard.shard_count as u8),
|
||||
};
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
locked.tenants.insert(tenant_shard_id, shard);
|
||||
}
|
||||
locked.save()
|
||||
};
|
||||
/// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
|
||||
/// the tenant from memory on this server.
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||
.execute(conn)?;
|
||||
|
||||
write.commit().await?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
Ok(())
|
||||
/// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
|
||||
/// batched increment of the generations of all tenants whose generation_pageserver is equal to
|
||||
/// the node that called /re-attach.
|
||||
#[tracing::instrument(skip_all, fields(node_id))]
|
||||
pub(crate) async fn re_attach(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let rows_updated = diesel::update(tenant_shards)
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.set(generation.eq(generation + 1))
|
||||
.execute(conn)?;
|
||||
|
||||
tracing::info!("Incremented {} tenants' generations", rows_updated);
|
||||
|
||||
// TODO: UPDATE+SELECT in one query
|
||||
|
||||
let updated = tenant_shards
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.select(TenantShardPersistence::as_select())
|
||||
.load(conn)?;
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let mut result = HashMap::new();
|
||||
for tsp in updated {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
|
||||
@@ -184,49 +339,48 @@ impl Persistence {
|
||||
pub(crate) async fn increment_generation(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: Option<NodeId>,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<Generation> {
|
||||
let (write, gen) = {
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
|
||||
anyhow::bail!("Tried to increment generation of unknown shard");
|
||||
};
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.set((
|
||||
generation.eq(generation + 1),
|
||||
generation_pageserver.eq(node_id.0 as i64),
|
||||
))
|
||||
// TODO: only returning() the generation column
|
||||
.returning(TenantShardPersistence::as_returning())
|
||||
.get_result(conn)?;
|
||||
|
||||
// If we're called with a None pageserver, we need only update the generation
|
||||
// record to disassociate it with this pageserver, not actually increment the number, as
|
||||
// the increment is guaranteed to happen the next time this tenant is attached.
|
||||
if node_id.is_some() {
|
||||
shard.generation += 1;
|
||||
}
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
shard.generation_pageserver = node_id;
|
||||
let gen = Generation::new(shard.generation);
|
||||
(locked.save(), gen)
|
||||
};
|
||||
|
||||
write.commit().await?;
|
||||
Ok(gen)
|
||||
Ok(Generation::new(updated.generation as u32))
|
||||
}
|
||||
|
||||
pub(crate) async fn re_attach(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
|
||||
let (write, result) = {
|
||||
let mut result = HashMap::new();
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
|
||||
if shard.generation_pageserver == Some(node_id) {
|
||||
shard.generation += 1;
|
||||
result.insert(*tenant_shard_id, Generation::new(shard.generation));
|
||||
}
|
||||
}
|
||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.set((
|
||||
generation_pageserver.eq(i64::MAX),
|
||||
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
|
||||
(locked.save(), result)
|
||||
};
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
write.commit().await?;
|
||||
Ok(result)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO: when we start shard splitting, we must durably mark the tenant so that
|
||||
@@ -246,7 +400,8 @@ impl Persistence {
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
pub(crate) tenant_id: String,
|
||||
@@ -257,16 +412,28 @@ pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
pub(crate) shard_stripe_size: i32,
|
||||
|
||||
// Currently attached pageserver
|
||||
#[serde(rename = "pageserver")]
|
||||
pub(crate) generation_pageserver: Option<NodeId>,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
pub(crate) generation: u32,
|
||||
pub(crate) generation: i32,
|
||||
|
||||
// Currently attached pageserver
|
||||
#[serde(rename = "pageserver")]
|
||||
pub(crate) generation_pageserver: i64,
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) placement_policy: String,
|
||||
#[serde(default)]
|
||||
pub(crate) config: String,
|
||||
}
|
||||
|
||||
/// Parts of [`crate::node::Node`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
|
||||
#[diesel(table_name = crate::schema::nodes)]
|
||||
pub(crate) struct NodePersistence {
|
||||
pub(crate) node_id: i64,
|
||||
pub(crate) scheduling_policy: String,
|
||||
pub(crate) listen_http_addr: String,
|
||||
pub(crate) listen_http_port: i32,
|
||||
pub(crate) listen_pg_addr: String,
|
||||
pub(crate) listen_pg_port: i32,
|
||||
}
|
||||
|
||||
@@ -296,7 +296,7 @@ impl Reconciler {
|
||||
// Increment generation before attaching to new pageserver
|
||||
self.generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, Some(dest_ps_id))
|
||||
.increment_generation(self.tenant_shard_id, dest_ps_id)
|
||||
.await?;
|
||||
|
||||
let dest_conf = build_location_config(
|
||||
@@ -395,7 +395,7 @@ impl Reconciler {
|
||||
// as locations with unknown (None) observed state.
|
||||
self.generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, Some(node_id))
|
||||
.increment_generation(self.tenant_shard_id, node_id)
|
||||
.await?;
|
||||
wanted_conf.generation = self.generation.into();
|
||||
tracing::info!("Observed configuration requires update.");
|
||||
|
||||
27
control_plane/attachment_service/src/schema.rs
Normal file
27
control_plane/attachment_service/src/schema.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
// @generated automatically by Diesel CLI.
|
||||
|
||||
diesel::table! {
|
||||
nodes (node_id) {
|
||||
node_id -> Int8,
|
||||
scheduling_policy -> Varchar,
|
||||
listen_http_addr -> Varchar,
|
||||
listen_http_port -> Int4,
|
||||
listen_pg_addr -> Varchar,
|
||||
listen_pg_port -> Int4,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
tenant_shards (tenant_id, shard_number, shard_count) {
|
||||
tenant_id -> Varchar,
|
||||
shard_number -> Int4,
|
||||
shard_count -> Int4,
|
||||
shard_stripe_size -> Int4,
|
||||
generation -> Int4,
|
||||
generation_pageserver -> Int8,
|
||||
placement_policy -> Varchar,
|
||||
config -> Text,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
|
||||
@@ -11,6 +11,7 @@ use control_plane::attachment_service::{
|
||||
TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
use diesel::result::DatabaseErrorKind;
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::{
|
||||
control_api::{
|
||||
@@ -26,6 +27,7 @@ use pageserver_api::{
|
||||
};
|
||||
use pageserver_client::mgmt_api;
|
||||
use utils::{
|
||||
completion::Barrier,
|
||||
generation::Generation,
|
||||
http::error::ApiError,
|
||||
id::{NodeId, TenantId},
|
||||
@@ -35,7 +37,7 @@ use utils::{
|
||||
use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
persistence::{Persistence, TenantShardPersistence},
|
||||
persistence::{DatabaseError, Persistence, TenantShardPersistence},
|
||||
scheduler::Scheduler,
|
||||
tenant_state::{
|
||||
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
|
||||
@@ -46,6 +48,10 @@ use crate::{
|
||||
|
||||
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
|
||||
/// up on unresponsive pageservers and proceed.
|
||||
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
struct ServiceState {
|
||||
tenants: BTreeMap<TenantShardId, TenantState>,
|
||||
@@ -79,10 +85,27 @@ pub struct Config {
|
||||
pub jwt_token: Option<String>,
|
||||
}
|
||||
|
||||
impl From<DatabaseError> for ApiError {
|
||||
fn from(err: DatabaseError) -> ApiError {
|
||||
match err {
|
||||
DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
|
||||
// FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
|
||||
DatabaseError::Connection(_e) => ApiError::ShuttingDown,
|
||||
DatabaseError::Logical(reason) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(reason))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Service {
|
||||
inner: Arc<std::sync::RwLock<ServiceState>>,
|
||||
config: Config,
|
||||
persistence: Arc<Persistence>,
|
||||
|
||||
/// This waits for initial reconciliation with pageservers to complete. Until this barrier
|
||||
/// passes, it isn't safe to do any actions that mutate tenants.
|
||||
pub(crate) startup_complete: Barrier,
|
||||
}
|
||||
|
||||
impl From<ReconcileWaitError> for ApiError {
|
||||
@@ -96,77 +119,32 @@ impl From<ReconcileWaitError> for ApiError {
|
||||
}
|
||||
|
||||
impl Service {
|
||||
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
|
||||
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
tracing::info!("Loading nodes from database...");
|
||||
let mut nodes = persistence.list_nodes().await?;
|
||||
tracing::info!("Loaded {} nodes from database.", nodes.len());
|
||||
|
||||
tracing::info!("Loading shards from database...");
|
||||
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
|
||||
tracing::info!(
|
||||
"Loaded {} shards from database.",
|
||||
tenant_shard_persistence.len()
|
||||
);
|
||||
|
||||
let mut tenants = BTreeMap::new();
|
||||
|
||||
for tsp in tenant_shard_persistence {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
let shard_identity = if tsp.shard_count == 0 {
|
||||
ShardIdentity::unsharded()
|
||||
} else {
|
||||
ShardIdentity::new(
|
||||
ShardNumber(tsp.shard_number as u8),
|
||||
ShardCount(tsp.shard_count as u8),
|
||||
ShardStripeSize(tsp.shard_stripe_size as u32),
|
||||
)?
|
||||
};
|
||||
let new_tenant = TenantState {
|
||||
tenant_shard_id,
|
||||
shard: shard_identity,
|
||||
sequence: Sequence::initial(),
|
||||
// Note that we load generation, but don't care about generation_pageserver. We will either end up finding
|
||||
// our existing attached location and it will match generation_pageserver, or we will attach somewhere new
|
||||
// and update generation_pageserver in the process.
|
||||
generation: Generation::new(tsp.generation),
|
||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
||||
intent: IntentState::new(),
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, new_tenant);
|
||||
}
|
||||
pub fn get_config(&self) -> &Config {
|
||||
&self.config
|
||||
}
|
||||
|
||||
/// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
|
||||
/// until this is done.
|
||||
async fn startup_reconcile(&self) {
|
||||
// For all tenant shards, a vector of observed states on nodes (where None means
|
||||
// indeterminate, same as in [`ObservedStateLocation`])
|
||||
let mut observed = HashMap::new();
|
||||
|
||||
let nodes = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
locked.nodes.clone()
|
||||
};
|
||||
|
||||
// TODO: issue these requests concurrently
|
||||
for node in &mut nodes {
|
||||
let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
|
||||
for node in nodes.values() {
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
|
||||
tracing::info!("Scanning shards on node {}...", node.id);
|
||||
match client.list_location_config().await {
|
||||
Err(e) => {
|
||||
tracing::warn!("Could not contact pageserver {} ({e})", node.id);
|
||||
// TODO: be more tolerant, apply a generous 5-10 second timeout
|
||||
// TODO: setting a node to Offline is a dramatic thing to do, and can
|
||||
// prevent neon_local from starting up (it starts this service before
|
||||
// any pageservers are running). It may make sense to give nodes
|
||||
// a Pending state to accomodate this situation, and allow (but deprioritize)
|
||||
// scheduling on Pending nodes.
|
||||
//node.availability = NodeAvailability::Offline;
|
||||
// TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
|
||||
// pageserver is being restarted at the same time as we are
|
||||
}
|
||||
Ok(listing) => {
|
||||
tracing::info!(
|
||||
@@ -174,7 +152,6 @@ impl Service {
|
||||
listing.tenant_shards.len(),
|
||||
node.id
|
||||
);
|
||||
node.availability = NodeAvailability::Active;
|
||||
|
||||
for (tenant_shard_id, conf_opt) in listing.tenant_shards {
|
||||
observed.insert(tenant_shard_id, (node.id, conf_opt));
|
||||
@@ -186,41 +163,46 @@ impl Service {
|
||||
let mut cleanup = Vec::new();
|
||||
|
||||
// Populate intent and observed states for all tenants, based on reported state on pageservers
|
||||
for (tenant_shard_id, (node_id, observed_loc)) in observed {
|
||||
let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
|
||||
cleanup.push((tenant_shard_id, node_id));
|
||||
continue;
|
||||
};
|
||||
let shard_count = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
for (tenant_shard_id, (node_id, observed_loc)) in observed {
|
||||
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
|
||||
cleanup.push((tenant_shard_id, node_id));
|
||||
continue;
|
||||
};
|
||||
|
||||
tenant_state
|
||||
.observed
|
||||
.locations
|
||||
.insert(node_id, ObservedStateLocation { conf: observed_loc });
|
||||
}
|
||||
|
||||
// State of nodes is now frozen, transform to a HashMap.
|
||||
let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
|
||||
|
||||
// Populate each tenant's intent state
|
||||
let mut scheduler = Scheduler::new(&tenants, &nodes);
|
||||
for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
|
||||
tenant_state.intent_from_observed();
|
||||
if let Err(e) = tenant_state.schedule(&mut scheduler) {
|
||||
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
|
||||
// not enough pageservers are available. The tenant may well still be available
|
||||
// to clients.
|
||||
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
|
||||
tenant_state
|
||||
.observed
|
||||
.locations
|
||||
.insert(node_id, ObservedStateLocation { conf: observed_loc });
|
||||
}
|
||||
}
|
||||
|
||||
// Populate each tenant's intent state
|
||||
let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
|
||||
for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
|
||||
tenant_state.intent_from_observed();
|
||||
if let Err(e) = tenant_state.schedule(&mut scheduler) {
|
||||
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
|
||||
// not enough pageservers are available. The tenant may well still be available
|
||||
// to clients.
|
||||
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
locked.tenants.len()
|
||||
};
|
||||
|
||||
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
|
||||
// generation_pageserver in the database.
|
||||
|
||||
// Clean up any tenants that were found on pageservers but are not known to us.
|
||||
for (tenant_shard_id, node_id) in cleanup {
|
||||
// A node reported a tenant_shard_id which is unknown to us: detach it.
|
||||
let node = nodes
|
||||
.get_mut(&node_id)
|
||||
.get(&node_id)
|
||||
.expect("Always exists: only known nodes are scanned");
|
||||
|
||||
let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
match client
|
||||
.location_config(
|
||||
tenant_shard_id,
|
||||
@@ -252,13 +234,80 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
let shard_count = tenants.len();
|
||||
// Finally, now that the service is up and running, launch reconcile operations for any tenants
|
||||
// which require it: under normal circumstances this should only include tenants that were in some
|
||||
// transient state before we restarted.
|
||||
let reconcile_tasks = self.reconcile_all();
|
||||
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
|
||||
}
|
||||
|
||||
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
|
||||
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
tracing::info!("Loading nodes from database...");
|
||||
let nodes = persistence.list_nodes().await?;
|
||||
let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
|
||||
tracing::info!("Loaded {} nodes from database.", nodes.len());
|
||||
|
||||
tracing::info!("Loading shards from database...");
|
||||
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
|
||||
tracing::info!(
|
||||
"Loaded {} shards from database.",
|
||||
tenant_shard_persistence.len()
|
||||
);
|
||||
|
||||
let mut tenants = BTreeMap::new();
|
||||
|
||||
for tsp in tenant_shard_persistence {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
};
|
||||
let shard_identity = if tsp.shard_count == 0 {
|
||||
ShardIdentity::unsharded()
|
||||
} else {
|
||||
ShardIdentity::new(
|
||||
ShardNumber(tsp.shard_number as u8),
|
||||
ShardCount(tsp.shard_count as u8),
|
||||
ShardStripeSize(tsp.shard_stripe_size as u32),
|
||||
)?
|
||||
};
|
||||
|
||||
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
|
||||
// it with what we can infer: the node for which a generation was most recently issued.
|
||||
let mut intent = IntentState::new();
|
||||
if tsp.generation_pageserver != i64::MAX {
|
||||
intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
|
||||
}
|
||||
|
||||
let new_tenant = TenantState {
|
||||
tenant_shard_id,
|
||||
shard: shard_identity,
|
||||
sequence: Sequence::initial(),
|
||||
generation: Generation::new(tsp.generation as u32),
|
||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
||||
intent,
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, new_tenant);
|
||||
}
|
||||
|
||||
let (startup_completion, startup_complete) = utils::completion::channel();
|
||||
|
||||
let this = Arc::new(Self {
|
||||
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
|
||||
result_tx, nodes, tenants,
|
||||
))),
|
||||
config,
|
||||
persistence,
|
||||
startup_complete,
|
||||
});
|
||||
|
||||
let result_task_this = this.clone();
|
||||
@@ -316,11 +365,13 @@ impl Service {
|
||||
}
|
||||
});
|
||||
|
||||
// Finally, now that the service is up and running, launch reconcile operations for any tenants
|
||||
// which require it: under normal circumstances this should only include tenants that were in some
|
||||
// transient state before we restarted.
|
||||
let reconcile_tasks = this.reconcile_all();
|
||||
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
|
||||
let startup_reconcile_this = this.clone();
|
||||
tokio::task::spawn(async move {
|
||||
// Block the [`Service::startup_complete`] barrier until we're done
|
||||
let _completion = startup_completion;
|
||||
|
||||
startup_reconcile_this.startup_reconcile().await
|
||||
});
|
||||
|
||||
Ok(this)
|
||||
}
|
||||
@@ -336,7 +387,6 @@ impl Service {
|
||||
let locked = self.inner.write().unwrap();
|
||||
!locked.tenants.contains_key(&attach_req.tenant_shard_id)
|
||||
};
|
||||
|
||||
if insert {
|
||||
let tsp = TenantShardPersistence {
|
||||
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
|
||||
@@ -344,31 +394,49 @@ impl Service {
|
||||
shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
|
||||
shard_stripe_size: 0,
|
||||
generation: 0,
|
||||
generation_pageserver: None,
|
||||
generation_pageserver: i64::MAX,
|
||||
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
|
||||
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||
};
|
||||
|
||||
self.persistence.insert_tenant_shards(vec![tsp]).await?;
|
||||
match self.persistence.insert_tenant_shards(vec![tsp]).await {
|
||||
Err(e) => match e {
|
||||
DatabaseError::Query(diesel::result::Error::DatabaseError(
|
||||
DatabaseErrorKind::UniqueViolation,
|
||||
_,
|
||||
)) => {
|
||||
tracing::info!(
|
||||
"Raced with another request to insert tenant {}",
|
||||
attach_req.tenant_shard_id
|
||||
)
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
},
|
||||
Ok(()) => {
|
||||
tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked.tenants.insert(
|
||||
attach_req.tenant_shard_id,
|
||||
TenantState::new(
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Single,
|
||||
),
|
||||
);
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked.tenants.insert(
|
||||
attach_req.tenant_shard_id,
|
||||
TenantState::new(
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Single,
|
||||
),
|
||||
);
|
||||
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let new_generation = if attach_req.node_id.is_some() {
|
||||
let new_generation = if let Some(req_node_id) = attach_req.node_id {
|
||||
Some(
|
||||
self.persistence
|
||||
.increment_generation(attach_req.tenant_shard_id, attach_req.node_id)
|
||||
.increment_generation(attach_req.tenant_shard_id, req_node_id)
|
||||
.await?,
|
||||
)
|
||||
} else {
|
||||
self.persistence.detach(attach_req.tenant_shard_id).await?;
|
||||
None
|
||||
};
|
||||
|
||||
@@ -380,6 +448,11 @@ impl Service {
|
||||
|
||||
if let Some(new_generation) = new_generation {
|
||||
tenant_state.generation = new_generation;
|
||||
} else {
|
||||
// This is a detach notification. We must update placement policy to avoid re-attaching
|
||||
// during background scheduling/reconciliation, or during attachment service restart.
|
||||
assert!(attach_req.node_id.is_none());
|
||||
tenant_state.policy = PlacementPolicy::Detached;
|
||||
}
|
||||
|
||||
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
|
||||
@@ -407,6 +480,7 @@ impl Service {
|
||||
"attach_hook: tenant {} set generation {:?}, pageserver {}",
|
||||
attach_req.tenant_shard_id,
|
||||
tenant_state.generation,
|
||||
// TODO: this is an odd number of 0xf's
|
||||
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
|
||||
);
|
||||
|
||||
@@ -499,6 +573,14 @@ impl Service {
|
||||
id: req_tenant.id,
|
||||
valid,
|
||||
});
|
||||
} else {
|
||||
// After tenant deletion, we may approve any validation. This avoids
|
||||
// spurious warnings on the pageserver if it has pending LSN updates
|
||||
// at the point a deletion happens.
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
valid: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
response
|
||||
@@ -554,7 +636,7 @@ impl Service {
|
||||
shard_count: tenant_shard_id.shard_count.0 as i32,
|
||||
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
|
||||
generation: 0,
|
||||
generation_pageserver: None,
|
||||
generation_pageserver: i64::MAX,
|
||||
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
|
||||
config: serde_json::to_string(&create_req.config).unwrap(),
|
||||
})
|
||||
@@ -868,7 +950,6 @@ impl Service {
|
||||
} else {
|
||||
let old_attached = shard.intent.attached;
|
||||
|
||||
shard.intent.attached = Some(migrate_req.node_id);
|
||||
match shard.policy {
|
||||
PlacementPolicy::Single => {
|
||||
shard.intent.secondary.clear();
|
||||
@@ -882,7 +963,13 @@ impl Service {
|
||||
shard.intent.secondary.push(old_attached);
|
||||
}
|
||||
}
|
||||
PlacementPolicy::Detached => {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
|
||||
)))
|
||||
}
|
||||
}
|
||||
shard.intent.attached = Some(migrate_req.node_id);
|
||||
|
||||
tracing::info!("Migrating: new intent {:?}", shard.intent);
|
||||
shard.sequence = shard.sequence.next();
|
||||
@@ -955,10 +1042,7 @@ impl Service {
|
||||
availability: NodeAvailability::Active,
|
||||
};
|
||||
// TODO: idempotency if the node already exists in the database
|
||||
self.persistence
|
||||
.insert_node(&new_node)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
self.persistence.insert_node(&new_node).await?;
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let mut new_nodes = (*locked.nodes).clone();
|
||||
|
||||
@@ -312,6 +312,18 @@ impl TenantState {
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Detached => {
|
||||
// Should have no attached or secondary pageservers
|
||||
if self.intent.attached.is_some() {
|
||||
self.intent.attached = None;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if modified {
|
||||
|
||||
@@ -1,15 +1,21 @@
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use diesel::{
|
||||
backend::Backend,
|
||||
query_builder::{AstPass, QueryFragment, QueryId},
|
||||
Connection, PgConnection, QueryResult, RunQueryDsl,
|
||||
};
|
||||
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
|
||||
use hyper::Method;
|
||||
use pageserver_api::{
|
||||
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{path::PathBuf, process::Child, str::FromStr};
|
||||
use std::{env, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
@@ -19,14 +25,17 @@ use utils::{
|
||||
pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: PathBuf,
|
||||
path: Utf8PathBuf,
|
||||
jwt_token: Option<String>,
|
||||
public_key_path: Option<Utf8PathBuf>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -169,7 +178,9 @@ pub struct TenantShardMigrateResponse {}
|
||||
|
||||
impl AttachmentService {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = env.base_data_dir.join("attachments.json");
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachments.json");
|
||||
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
@@ -181,6 +192,13 @@ impl AttachmentService {
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
|
||||
// port, for use by our captive postgres.
|
||||
let postgres_port = listen_url
|
||||
.port()
|
||||
.expect("Control plane API setting should always have a port")
|
||||
+ 1;
|
||||
|
||||
// Assume all pageservers have symmetric auth configuration: this service
|
||||
// expects to use one JWT token to talk to all of them.
|
||||
let ps_conf = env
|
||||
@@ -209,6 +227,7 @@ impl AttachmentService {
|
||||
listen,
|
||||
jwt_token,
|
||||
public_key_path,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
@@ -220,13 +239,214 @@ impl AttachmentService {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<Child> {
|
||||
let path_str = self.path.to_string_lossy();
|
||||
/// PIDFile for the postgres instance used to store attachment service state
|
||||
fn postgres_pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join("attachment_service_postgres.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
let mut args = vec!["-l", &self.listen, "-p", &path_str]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
/// In order to access database migrations, we need to find the Neon source tree
|
||||
async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
// We assume that either prd or our binary is in the source tree. The former is usually
|
||||
// true for automated test runners, the latter is usually true for developer workstations. Often
|
||||
// both are true, which is fine.
|
||||
let candidate_start_points = [
|
||||
// Current working directory
|
||||
Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
|
||||
// Directory containing the binary we're running inside
|
||||
Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
|
||||
];
|
||||
|
||||
// For each candidate start point, search through ancestors looking for a neon.git source tree root
|
||||
for start_point in &candidate_start_points {
|
||||
// Start from the build dir: assumes we are running out of a built neon source tree
|
||||
for path in start_point.ancestors() {
|
||||
// A crude approximation: the root of the source tree is whatever contains a "control_plane"
|
||||
// subdirectory.
|
||||
let control_plane = path.join("control_plane");
|
||||
if tokio::fs::try_exists(&control_plane).await? {
|
||||
return Ok(path.to_owned());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fall-through
|
||||
Err(anyhow::anyhow!(
|
||||
"Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
|
||||
))
|
||||
}
|
||||
|
||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||
///
|
||||
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
|
||||
|
||||
for v in prefer_versions {
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
||||
if tokio::fs::try_exists(&path).await? {
|
||||
return Ok(path);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through
|
||||
anyhow::bail!(
|
||||
"Postgres binaries not found in {}",
|
||||
self.env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
/// Readiness check for our postgres process
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||
let bin_path = pg_bin_dir.join("pg_isready");
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
|
||||
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
||||
|
||||
Ok(exitcode.success())
|
||||
}
|
||||
|
||||
/// Create our database if it doesn't exist, and run migrations.
|
||||
///
|
||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||
/// who just want to run `cargo neon_local` without knowing about diesel.
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
let database_url = format!(
|
||||
"postgresql://localhost:{}/attachment_service",
|
||||
self.postgres_port
|
||||
);
|
||||
println!("Running attachment service database setup...");
|
||||
fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
|
||||
let base = ::url::Url::parse(database_url).unwrap();
|
||||
let database = base.path_segments().unwrap().last().unwrap().to_owned();
|
||||
let mut new_url = base.join(default_database).unwrap();
|
||||
new_url.set_query(base.query());
|
||||
(database, new_url.into())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CreateDatabaseStatement {
|
||||
db_name: String,
|
||||
}
|
||||
|
||||
impl CreateDatabaseStatement {
|
||||
pub fn new(db_name: &str) -> Self {
|
||||
CreateDatabaseStatement {
|
||||
db_name: db_name.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
|
||||
fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
|
||||
out.push_sql("CREATE DATABASE ");
|
||||
out.push_identifier(&self.db_name)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
|
||||
|
||||
impl QueryId for CreateDatabaseStatement {
|
||||
type QueryId = ();
|
||||
|
||||
const HAS_STATIC_QUERY_ID: bool = false;
|
||||
}
|
||||
if PgConnection::establish(&database_url).is_err() {
|
||||
let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
|
||||
println!("Creating database: {database}");
|
||||
let mut conn = PgConnection::establish(&postgres_url)?;
|
||||
CreateDatabaseStatement::new(&database).execute(&mut conn)?;
|
||||
}
|
||||
let mut conn = PgConnection::establish(&database_url)?;
|
||||
|
||||
let migrations_dir = self
|
||||
.find_source_root()
|
||||
.await?
|
||||
.join("control_plane/attachment_service/migrations");
|
||||
|
||||
let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
|
||||
println!("Running migrations in {}", migrations.path().display());
|
||||
HarnessWithOutput::write_to_stdout(&mut conn)
|
||||
.run_pending_migrations(migrations)
|
||||
.map(|_| ())
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
println!("Migrations complete");
|
||||
|
||||
Ok(database_url)
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the attachment service for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachment_service_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
};
|
||||
|
||||
println!("Starting attachment service database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"attachment_service_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
"-p",
|
||||
self.path.as_ref(),
|
||||
"--database-url",
|
||||
&database_url,
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
}
|
||||
@@ -235,7 +455,7 @@ impl AttachmentService {
|
||||
args.push(format!("--public-key={public_key_path}"));
|
||||
}
|
||||
|
||||
let result = background_process::start_process(
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.attachment_service_bin(),
|
||||
@@ -252,33 +472,50 @@ impl AttachmentService {
|
||||
}
|
||||
},
|
||||
)
|
||||
.await;
|
||||
.await?;
|
||||
|
||||
for ps_conf in &self.env.pageservers {
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
self.node_register(NodeRegisterRequest {
|
||||
node_id: ps_conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
println!("Stopping attachment service database...");
|
||||
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
|
||||
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_stop_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
if !stop_status.success() {
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Attachment service data base is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())
|
||||
}
|
||||
/// Simple HTTP request wrapper for calling into attachment service
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: reqwest::Method,
|
||||
method: hyper::Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> anyhow::Result<RS>
|
||||
@@ -356,7 +593,7 @@ impl AttachmentService {
|
||||
&self,
|
||||
req: TenantCreateRequest,
|
||||
) -> anyhow::Result<TenantCreateResponse> {
|
||||
self.dispatch(Method::POST, "tenant".to_string(), Some(req))
|
||||
self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -413,7 +650,7 @@ impl AttachmentService {
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
self.dispatch(
|
||||
Method::POST,
|
||||
format!("tenant/{tenant_id}/timeline"),
|
||||
format!("v1/tenant/{tenant_id}/timeline"),
|
||||
Some(req),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::io::Write;
|
||||
use std::os::unix::prelude::AsRawFd;
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Child, Command};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{fs, io, thread};
|
||||
|
||||
@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
envs: EI,
|
||||
initial_pid_file: InitialPidFile,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<Child>
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
F: Fn() -> Fut,
|
||||
Fut: std::future::Future<Output = anyhow::Result<bool>>,
|
||||
@@ -98,7 +98,7 @@ where
|
||||
InitialPidFile::Expect(path) => path,
|
||||
};
|
||||
|
||||
let mut spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
let spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
format!("Could not spawn {process_name}, see console output and log files for details.")
|
||||
})?;
|
||||
let pid = spawned_process.id();
|
||||
@@ -106,12 +106,26 @@ where
|
||||
i32::try_from(pid)
|
||||
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
|
||||
);
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
|
||||
println!("SIGKILL & wait the started process");
|
||||
(|| {
|
||||
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo).
|
||||
spawned_process.kill().context("SIGKILL child")?;
|
||||
spawned_process.wait().context("wait() for child process")?;
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check).await {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
println!("\n{process_name} started and passed status check, pid: {pid}");
|
||||
// leak the child process, it'll outlive this neon_local invocation
|
||||
drop(scopeguard::ScopeGuard::into_inner(spawned_process));
|
||||
return Ok(());
|
||||
}
|
||||
Ok(false) => {
|
||||
if retries == NOTICE_AFTER_RETRIES {
|
||||
@@ -126,16 +140,15 @@ where
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} failed to start: {e:#}");
|
||||
if let Err(e) = spawned_process.kill() {
|
||||
println!("Could not stop {process_name} subprocess: {e:#}")
|
||||
};
|
||||
println!("error starting process {process_name:?}: {e:#}");
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
|
||||
anyhow::bail!(
|
||||
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
|
||||
);
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
|
||||
@@ -135,7 +135,7 @@ fn main() -> Result<()> {
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
||||
"stop" => handle_stop_all(sub_args, &env),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
|
||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
||||
@@ -1056,8 +1056,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.start(&pageserver_config_overrides(subcommand_args), *register)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
@@ -1086,24 +1087,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("migrate", subcommand_args)) => {
|
||||
let pageserver = get_pageserver(env, subcommand_args)?;
|
||||
//TODO what shutdown strategy should we use here?
|
||||
if let Err(e) = pageserver.stop(false) {
|
||||
eprintln!("pageserver stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.start(&pageserver_config_overrides(subcommand_args), false)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
@@ -1161,7 +1145,7 @@ async fn handle_attachment_service(
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = svc.stop(immediate) {
|
||||
if let Err(e) = svc.stop(immediate).await {
|
||||
eprintln!("stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1257,7 +1241,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.start().await {
|
||||
eprintln!("attachment_service start failed: {:#}", e);
|
||||
try_stop_all(env, true);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@@ -1265,11 +1249,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(sub_match))
|
||||
.start(&pageserver_config_overrides(sub_match), true)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
try_stop_all(env, true);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@@ -1278,23 +1262,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start(vec![]).await {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false);
|
||||
try_stop_all(env, false).await;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let immediate =
|
||||
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
try_stop_all(env, immediate);
|
||||
try_stop_all(env, immediate).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
// Stop all endpoints
|
||||
match ComputeControlPlane::load(env.clone()) {
|
||||
Ok(cplane) => {
|
||||
@@ -1329,7 +1313,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
|
||||
if env.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.stop(immediate) {
|
||||
if let Err(e) = attachment_service.stop(immediate).await {
|
||||
eprintln!("attachment service stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
@@ -1549,7 +1533,11 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
|
||||
.long("register")
|
||||
.default_value("true").required(false)
|
||||
.value_parser(value_parser!(bool))
|
||||
.value_name("register"))
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
|
||||
@@ -57,7 +57,7 @@ use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
|
||||
use compute_api::responses::{ComputeState, ComputeStatus};
|
||||
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
|
||||
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
@@ -70,6 +70,7 @@ pub struct EndpointConf {
|
||||
http_port: u16,
|
||||
pg_version: u32,
|
||||
skip_pg_catalog_updates: bool,
|
||||
features: Vec<ComputeFeature>,
|
||||
}
|
||||
|
||||
//
|
||||
@@ -140,6 +141,7 @@ impl ComputeControlPlane {
|
||||
// with this we basically test a case of waking up an idle compute, where
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates: true,
|
||||
features: vec![],
|
||||
});
|
||||
|
||||
ep.create_endpoint_dir()?;
|
||||
@@ -154,6 +156,7 @@ impl ComputeControlPlane {
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: true,
|
||||
features: vec![],
|
||||
})?,
|
||||
)?;
|
||||
std::fs::write(
|
||||
@@ -215,6 +218,9 @@ pub struct Endpoint {
|
||||
|
||||
// Optimizations
|
||||
skip_pg_catalog_updates: bool,
|
||||
|
||||
// Feature flags
|
||||
features: Vec<ComputeFeature>,
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
@@ -244,6 +250,7 @@ impl Endpoint {
|
||||
tenant_id: conf.tenant_id,
|
||||
pg_version: conf.pg_version,
|
||||
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
|
||||
features: conf.features,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -431,7 +438,7 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
@@ -519,7 +526,7 @@ impl Endpoint {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: vec![],
|
||||
features: self.features.clone(),
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
@@ -576,9 +583,21 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let child = scopeguard::guard(child, |mut child| {
|
||||
println!("SIGKILL & wait the started process");
|
||||
(|| {
|
||||
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
|
||||
child.kill().context("SIGKILL child")?;
|
||||
child.wait().context("wait() for child process")?;
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
// Write down the pid so we can wait for it when we want to stop
|
||||
// TODO use background_process::start_process instead
|
||||
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
|
||||
let pid = child.id();
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
std::fs::write(pidfile_path, pid.to_string())?;
|
||||
@@ -627,6 +646,9 @@ impl Endpoint {
|
||||
std::thread::sleep(ATTEMPT_INTERVAL);
|
||||
}
|
||||
|
||||
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
|
||||
drop(scopeguard::ScopeGuard::into_inner(child));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -223,7 +223,11 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
pub fn attachment_service_bin(&self) -> PathBuf {
|
||||
self.neon_distrib_dir.join("attachment_service")
|
||||
// Irrespective of configuration, attachment service binary is always
|
||||
// run from the same location as neon_local. This means that for compatibility
|
||||
// tests that run old pageserver/safekeeper, they still run latest attachment service.
|
||||
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
|
||||
neon_local_bin_dir.join("attachment_service")
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::io;
|
||||
use std::io::Write;
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Child, Command};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
@@ -30,6 +30,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
|
||||
use crate::local_env::PageServerConf;
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
@@ -161,8 +162,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
self.start_node(config_overrides, false).await
|
||||
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, false, register).await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
@@ -207,7 +208,8 @@ impl PageServerNode {
|
||||
&self,
|
||||
config_overrides: &[&str],
|
||||
update_config: bool,
|
||||
) -> anyhow::Result<Child> {
|
||||
register: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
@@ -244,7 +246,26 @@ impl PageServerNode {
|
||||
}
|
||||
},
|
||||
)
|
||||
.await
|
||||
.await?;
|
||||
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pageserver_basic_args<'a>(
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
//! ```
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -104,7 +103,7 @@ impl SafekeeperNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
|
||||
9
diesel.toml
Normal file
9
diesel.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
# For documentation on how to configure this file,
|
||||
# see https://diesel.rs/guides/configuring-diesel-cli
|
||||
|
||||
[print_schema]
|
||||
file = "control_plane/attachment_service/src/schema.rs"
|
||||
custom_type_derives = ["diesel::query_builder::QueryId"]
|
||||
|
||||
[migrations_directory]
|
||||
dir = "control_plane/attachment_service/migrations"
|
||||
@@ -90,6 +90,9 @@ pub enum ComputeFeature {
|
||||
/// track short-lived connections as user activity.
|
||||
ActivityMonitorExperimental,
|
||||
|
||||
/// Enable running migrations
|
||||
Migrations,
|
||||
|
||||
/// This is a special feature flag that is used to represent unknown feature flags.
|
||||
/// Basically all unknown to enum flags are represented as this one. See unit test
|
||||
/// `parse_unknown_features()` for more details.
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::{fmt, ops::Range};
|
||||
|
||||
use crate::reltag::{BlockNumber, RelTag};
|
||||
use crate::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
|
||||
/// Key used in the Repository kv-store.
|
||||
///
|
||||
@@ -143,12 +145,390 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
// Layout of the Key address space
|
||||
//
|
||||
// The Key struct, used to address the underlying key-value store, consists of
|
||||
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
|
||||
// all the data and metadata keys into those 18 bytes.
|
||||
//
|
||||
// Principles for the mapping:
|
||||
//
|
||||
// - Things that are often accessed or modified together, should be close to
|
||||
// each other in the key space. For example, if a relation is extended by one
|
||||
// block, we create a new key-value pair for the block data, and update the
|
||||
// relation size entry. Because of that, the RelSize key comes after all the
|
||||
// RelBlocks of a relation: the RelSize and the last RelBlock are always next
|
||||
// to each other.
|
||||
//
|
||||
// The key space is divided into four major sections, identified by the first
|
||||
// byte, and the form a hierarchy:
|
||||
//
|
||||
// 00 Relation data and metadata
|
||||
//
|
||||
// DbDir () -> (dbnode, spcnode)
|
||||
// Filenodemap
|
||||
// RelDir -> relnode forknum
|
||||
// RelBlocks
|
||||
// RelSize
|
||||
//
|
||||
// 01 SLRUs
|
||||
//
|
||||
// SlruDir kind
|
||||
// SlruSegBlocks segno
|
||||
// SlruSegSize
|
||||
//
|
||||
// 02 pg_twophase
|
||||
//
|
||||
// 03 misc
|
||||
// Controlfile
|
||||
// checkpoint
|
||||
// pg_version
|
||||
//
|
||||
// 04 aux files
|
||||
//
|
||||
// Below is a full list of the keyspace allocation:
|
||||
//
|
||||
// DbDir:
|
||||
// 00 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// Filenodemap:
|
||||
// 00 SPCNODE DBNODE 00000000 00 00000000
|
||||
//
|
||||
// RelDir:
|
||||
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
|
||||
//
|
||||
// RelBlock:
|
||||
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
|
||||
//
|
||||
// RelSize:
|
||||
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
|
||||
//
|
||||
// SlruDir:
|
||||
// 01 kind 00000000 00000000 00 00000000
|
||||
//
|
||||
// SlruSegBlock:
|
||||
// 01 kind 00000001 SEGNO 00 BLKNUM
|
||||
//
|
||||
// SlruSegSize:
|
||||
// 01 kind 00000001 SEGNO 00 FFFFFFFF
|
||||
//
|
||||
// TwoPhaseDir:
|
||||
// 02 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// TwoPhaseFile:
|
||||
// 02 00000000 00000000 00000000 00 XID
|
||||
//
|
||||
// ControlFile:
|
||||
// 03 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// Checkpoint:
|
||||
// 03 00000000 00000000 00000000 00 00000001
|
||||
//
|
||||
// AuxFiles:
|
||||
// 03 00000000 00000000 00000000 00 00000002
|
||||
//
|
||||
|
||||
//-- Section 01: relation data and metadata
|
||||
|
||||
pub const DBDIR_KEY: Key = Key {
|
||||
field1: 0x00,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
#[inline(always)]
|
||||
pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0xffffffff,
|
||||
field5: 0xff,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 1,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_size_to_key(rel: RelTag) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn rel_key_range(rel: RelTag) -> Range<Key> {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum + 1,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 02: SLRUs
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_dir_to_key(kind: SlruKind) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
|
||||
let field2 = match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
};
|
||||
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2,
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x01,
|
||||
field2,
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 1,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 03: pg_twophase
|
||||
|
||||
pub const TWOPHASEDIR_KEY: Key = Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
#[inline(always)]
|
||||
pub fn twophase_file_key(xid: TransactionId) -> Key {
|
||||
Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: xid,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
|
||||
let (next_xid, overflowed) = xid.overflowing_add(1);
|
||||
|
||||
Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: xid,
|
||||
}..Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: u8::from(overflowed),
|
||||
field6: next_xid,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 03: Control file
|
||||
pub const CONTROLFILE_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
pub const CHECKPOINT_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 1,
|
||||
};
|
||||
|
||||
pub const AUX_FILES_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 2,
|
||||
};
|
||||
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(key: Key) -> bool {
|
||||
key != AUX_FILES_KEY
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_vm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00
|
||||
&& key.field4 != 0
|
||||
&& key.field5 == VISIBILITYMAP_FORKNUM
|
||||
&& key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x01 => {
|
||||
let kind = match key.field2 {
|
||||
0x00 => SlruKind::Clog,
|
||||
0x01 => SlruKind::MultiXactMembers,
|
||||
0x02 => SlruKind::MultiXactOffsets,
|
||||
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
|
||||
};
|
||||
let segno = key.field4;
|
||||
let blknum = key.field6;
|
||||
|
||||
(kind, segno, blknum)
|
||||
}
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_slru_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x01 // SLRU-related
|
||||
&& key.field3 == 0x00000001 // but not SlruDir
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_block_key(key: &Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
|
||||
@@ -104,6 +104,7 @@ pub struct KeySpaceAccum {
|
||||
accum: Option<Range<Key>>,
|
||||
|
||||
ranges: Vec<Range<Key>>,
|
||||
size: u64,
|
||||
}
|
||||
|
||||
impl KeySpaceAccum {
|
||||
@@ -111,6 +112,7 @@ impl KeySpaceAccum {
|
||||
Self {
|
||||
accum: None,
|
||||
ranges: Vec::new(),
|
||||
size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +123,8 @@ impl KeySpaceAccum {
|
||||
|
||||
#[inline(always)]
|
||||
pub fn add_range(&mut self, range: Range<Key>) {
|
||||
self.size += key_range_size(&range) as u64;
|
||||
|
||||
match self.accum.as_mut() {
|
||||
Some(accum) => {
|
||||
if range.start == accum.end {
|
||||
@@ -146,6 +150,23 @@ impl KeySpaceAccum {
|
||||
ranges: self.ranges,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
if let Some(accum) = self.accum.take() {
|
||||
self.ranges.push(accum);
|
||||
}
|
||||
|
||||
let mut prev_accum = KeySpaceAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
KeySpace {
|
||||
ranges: prev_accum.ranges,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -254,6 +275,30 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keyspace_consume() {
|
||||
let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
|
||||
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
for range in &ranges {
|
||||
accum.add_range(range.clone());
|
||||
}
|
||||
|
||||
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
|
||||
assert_eq!(accum.size(), expected_size);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
|
||||
assert_eq!(accum.size(), 0);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), vec![]);
|
||||
assert_eq!(accum.size(), 0);
|
||||
|
||||
for range in &ranges {
|
||||
accum.add_range(range.clone());
|
||||
}
|
||||
assert_ks_eq(&accum.to_keyspace(), ranges);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keyspace_add_range() {
|
||||
// two separate ranges
|
||||
|
||||
@@ -111,7 +111,19 @@ impl RelTag {
|
||||
/// These files are divided into segments, which are divided into
|
||||
/// pages of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
Hash,
|
||||
Serialize,
|
||||
Deserialize,
|
||||
PartialEq,
|
||||
Eq,
|
||||
PartialOrd,
|
||||
Ord,
|
||||
strum_macros::EnumIter,
|
||||
)]
|
||||
pub enum SlruKind {
|
||||
Clog,
|
||||
MultiXactMembers,
|
||||
|
||||
@@ -329,8 +329,8 @@ impl CheckPoint {
|
||||
///
|
||||
/// Returns 'true' if the XID was updated.
|
||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
// nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround.
|
||||
let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
|
||||
let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
|
||||
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
|
||||
new_xid =
|
||||
|
||||
@@ -15,11 +15,7 @@ aws-sdk-s3.workspace = true
|
||||
aws-credential-types.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
hyper = { workspace = true, features = [] }
|
||||
hyper-util = { workspace = true, features = [] }
|
||||
http = { workspace = true, features = [] }
|
||||
http-body = { workspace = true, features = [] }
|
||||
http-body-util = { workspace = true, features = [] }
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
@@ -23,6 +24,7 @@ use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
@@ -183,7 +185,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -371,6 +372,20 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
copy_status = status;
|
||||
}
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
// TODO use Azure point in time recovery feature for this
|
||||
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
|
||||
Err(anyhow::anyhow!(
|
||||
"time travel recovery for azure blob storage is not implemented"
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
|
||||
@@ -25,6 +25,7 @@ use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
|
||||
@@ -142,7 +143,7 @@ pub struct Listing {
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
@@ -210,6 +211,15 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
|
||||
/// Copy a remote object inside a bucket from one path to another.
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
/// Resets the content of everything with the given prefix to the given state
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
@@ -262,14 +272,15 @@ impl std::error::Error for DownloadError {}
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
#[derive(Clone)]
|
||||
pub enum GenericRemoteStorage {
|
||||
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
|
||||
pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
|
||||
LocalFs(LocalFs),
|
||||
AwsS3(Arc<S3Bucket>),
|
||||
AzureBlob(Arc<AzureBlobStorage>),
|
||||
Unreliable(Arc<UnreliableWrapper>),
|
||||
Unreliable(Other),
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
pub async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -386,6 +397,33 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(s) => s.copy(from, to).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
@@ -673,6 +711,7 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::List => &self.read,
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
RequestKind::TimeTravel => &self.write,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//! This storage used in tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
@@ -14,7 +14,7 @@ use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
@@ -157,7 +157,6 @@ impl LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -423,6 +422,17 @@ impl RemoteStorage for LocalFs {
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::diverging_sub_expression)]
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
|
||||
@@ -6,12 +6,14 @@
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{Context, Poll},
|
||||
time::SystemTime,
|
||||
};
|
||||
|
||||
use anyhow::Context as _;
|
||||
use anyhow::{anyhow, Context as _};
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
@@ -27,19 +29,19 @@ use aws_sdk_s3::{
|
||||
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
types::{Delete, ObjectIdentifier},
|
||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use aws_smithy_types::body::SdkBody;
|
||||
use aws_smithy_types::byte_stream::ByteStream;
|
||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::TryStreamExt;
|
||||
use http_body::Frame;
|
||||
use http_body_util::StreamBody;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
@@ -272,6 +274,59 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_oids(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
delete_objects: &[ObjectIdentifier],
|
||||
) -> anyhow::Result<()> {
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
let resp = resp?;
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
@@ -375,7 +430,6 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -471,8 +525,8 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let body = StreamBody::new(from.map_ok(Frame::data));
|
||||
let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
|
||||
let body = Body::wrap_stream(from);
|
||||
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
|
||||
|
||||
let res = self
|
||||
.client
|
||||
@@ -571,64 +625,168 @@ impl RemoteStorage for S3Bucket {
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
match resp {
|
||||
Ok(resp) => {
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
self.delete_oids(kind, &delete_objects).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
let timestamp = DateTime::from(timestamp);
|
||||
let done_if_after = DateTime::from(done_if_after);
|
||||
|
||||
tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
let is_permanent = |_e: &_| false;
|
||||
|
||||
let list = backoff::retry(
|
||||
|| async {
|
||||
Ok(self
|
||||
.client
|
||||
.list_object_versions()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(prefix.clone())
|
||||
.send()
|
||||
.await?)
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"listing object versions for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if list.is_truncated().unwrap_or_default() {
|
||||
anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
|
||||
}
|
||||
|
||||
let mut versions_deletes = list
|
||||
.versions()
|
||||
.iter()
|
||||
.map(VerOrDelete::Version)
|
||||
.chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
|
||||
|
||||
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
|
||||
|
||||
for vd in versions_deletes {
|
||||
let last_modified = vd.last_modified();
|
||||
let version_id = vd.version_id();
|
||||
let key = vd.key();
|
||||
let (Some(last_modified), Some(version_id), Some(key)) =
|
||||
(last_modified, version_id, key)
|
||||
else {
|
||||
anyhow::bail!(
|
||||
"One (or more) of last_modified, key, and id is None. \
|
||||
Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
|
||||
last_modified, key, version_id,
|
||||
);
|
||||
};
|
||||
if version_id == "null" {
|
||||
anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
|
||||
indicating either disabled versioning, or legacy objects with null version id values");
|
||||
}
|
||||
tracing::trace!(
|
||||
"Parsing version key={key} version_id={version_id} is_delete={}",
|
||||
matches!(vd, VerOrDelete::DeleteMarker(_))
|
||||
);
|
||||
|
||||
vds_for_key
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((vd, last_modified, version_id));
|
||||
}
|
||||
for (key, versions) in vds_for_key {
|
||||
let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
|
||||
if last_last_modified > &&done_if_after {
|
||||
tracing::trace!("Key {key} has version later than done_if_after, skipping");
|
||||
continue;
|
||||
}
|
||||
// the version we want to restore to.
|
||||
let version_to_restore_to =
|
||||
match versions.binary_search_by_key(×tamp, |tpl| *tpl.1) {
|
||||
Ok(v) => v,
|
||||
Err(e) => e,
|
||||
};
|
||||
if version_to_restore_to == versions.len() {
|
||||
tracing::trace!("Key {key} has no changes since timestamp, skipping");
|
||||
continue;
|
||||
}
|
||||
let mut do_delete = false;
|
||||
if version_to_restore_to == 0 {
|
||||
// All versions more recent, so the key didn't exist at the specified time point.
|
||||
tracing::trace!(
|
||||
"All {} versions more recent for {key}, deleting",
|
||||
versions.len()
|
||||
);
|
||||
do_delete = true;
|
||||
} else {
|
||||
match &versions[version_to_restore_to - 1] {
|
||||
(VerOrDelete::Version(_), _last_modified, version_id) => {
|
||||
tracing::trace!("Copying old version {version_id} for {key}...");
|
||||
// Restore the state to the last version by copying
|
||||
let source_id =
|
||||
format!("{}/{key}?versionId={version_id}", self.bucket_name);
|
||||
|
||||
backoff::retry(
|
||||
|| async {
|
||||
Ok(self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(key)
|
||||
.copy_source(&source_id)
|
||||
.send()
|
||||
.await?)
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"listing object versions for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
(VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
|
||||
do_delete = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
if do_delete {
|
||||
if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
|
||||
// Key has since been deleted (but there was some history), no need to do anything
|
||||
tracing::trace!("Key {key} already deleted, skipping.");
|
||||
} else {
|
||||
tracing::trace!("Deleting {key}...");
|
||||
|
||||
let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
|
||||
self.delete_oids(kind, &[oid]).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
@@ -653,6 +811,32 @@ fn start_measuring_requests(
|
||||
})
|
||||
}
|
||||
|
||||
enum VerOrDelete<'a> {
|
||||
Version(&'a ObjectVersion),
|
||||
DeleteMarker(&'a DeleteMarkerEntry),
|
||||
}
|
||||
|
||||
impl<'a> VerOrDelete<'a> {
|
||||
fn last_modified(&self) -> Option<&'a DateTime> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.last_modified(),
|
||||
VerOrDelete::DeleteMarker(v) => v.last_modified(),
|
||||
}
|
||||
}
|
||||
fn version_id(&self) -> Option<&'a str> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.version_id(),
|
||||
VerOrDelete::DeleteMarker(v) => v.version_id(),
|
||||
}
|
||||
}
|
||||
fn key(&self) -> Option<&'a str> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.key(),
|
||||
VerOrDelete::DeleteMarker(v) => v.key(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use camino::Utf8Path;
|
||||
|
||||
@@ -12,6 +12,7 @@ pub(crate) enum RequestKind {
|
||||
Delete = 2,
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
TimeTravel = 5,
|
||||
}
|
||||
|
||||
use RequestKind::*;
|
||||
@@ -24,6 +25,7 @@ impl RequestKind {
|
||||
Delete => "delete_object",
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
TimeTravel => "time_travel_recover",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -31,7 +33,7 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct RequestTyped<C>([C; 5]);
|
||||
pub(super) struct RequestTyped<C>([C; 6]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -40,8 +42,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy].into_iter();
|
||||
let arr = std::array::from_fn::<C, 5, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
|
||||
let arr = std::array::from_fn::<C, 6, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -3,16 +3,19 @@
|
||||
//! testing purposes.
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
use std::time::SystemTime;
|
||||
use std::{collections::hash_map::Entry, sync::Arc};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
StorageMetadata,
|
||||
};
|
||||
|
||||
pub struct UnreliableWrapper {
|
||||
inner: crate::GenericRemoteStorage,
|
||||
inner: GenericRemoteStorage<Arc<VoidStorage>>,
|
||||
|
||||
// This many attempts of each operation will fail, then we let it succeed.
|
||||
attempts_to_fail: u64,
|
||||
@@ -29,11 +32,21 @@ enum RemoteOp {
|
||||
Download(RemotePath),
|
||||
Delete(RemotePath),
|
||||
DeleteObjects(Vec<RemotePath>),
|
||||
TimeTravelRecover(Option<RemotePath>),
|
||||
}
|
||||
|
||||
impl UnreliableWrapper {
|
||||
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
|
||||
assert!(attempts_to_fail > 0);
|
||||
let inner = match inner {
|
||||
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
|
||||
GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
|
||||
GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
|
||||
// We could also make this a no-op, as in, extract the inner of the passed generic remote storage
|
||||
GenericRemoteStorage::Unreliable(_s) => {
|
||||
panic!("Can't wrap unreliable wrapper unreliably")
|
||||
}
|
||||
};
|
||||
UnreliableWrapper {
|
||||
inner,
|
||||
attempts_to_fail,
|
||||
@@ -84,7 +97,9 @@ impl UnreliableWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
|
||||
type VoidStorage = crate::LocalFs;
|
||||
|
||||
impl RemoteStorage for UnreliableWrapper {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
@@ -169,4 +184,17 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.copy_object(from, to).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
|
||||
self.inner
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,21 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::fmt::{Debug, Display};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
use std::time::{Duration, UNIX_EPOCH};
|
||||
use std::{collections::HashSet, time::SystemTime};
|
||||
|
||||
use crate::common::{download_to_vec, upload_stream};
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures_util::Future;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
mod common;
|
||||
@@ -18,11 +24,160 @@ mod common;
|
||||
mod tests_s3;
|
||||
|
||||
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
|
||||
use utils::backoff;
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorage::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
// Our test depends on discrepancies in the clock between S3 and the environment the tests
|
||||
// run in. Therefore, wait a little bit before and after. The alternative would be
|
||||
// to take the time from S3 response headers.
|
||||
const WAIT_TIME: Duration = Duration::from_millis(3_000);
|
||||
|
||||
async fn retry<T, O, F, E>(op: O) -> Result<T, E>
|
||||
where
|
||||
E: Display + Debug + 'static,
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
{
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
backoff::retry(
|
||||
op,
|
||||
|_e| false,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"test retry",
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn time_point() -> SystemTime {
|
||||
tokio::time::sleep(WAIT_TIME).await;
|
||||
let ret = SystemTime::now();
|
||||
tokio::time::sleep(WAIT_TIME).await;
|
||||
ret
|
||||
}
|
||||
|
||||
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(retry(|| client.list_files(None))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>())
|
||||
}
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t0_files = list_files(&ctx.client).await?;
|
||||
let t0 = time_point().await;
|
||||
println!("at t0: {t0_files:?}");
|
||||
|
||||
let old_data = "remote blob data2";
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(old_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t1_files = list_files(&ctx.client).await?;
|
||||
let t1 = time_point().await;
|
||||
println!("at t1: {t1_files:?}");
|
||||
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = retry(|| ctx.client.download(&path2)).await?;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
let t1_hwt = t1 - half_wt;
|
||||
if !(t0_hwt..=t1_hwt).contains(&last_modified) {
|
||||
panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
|
||||
This likely means a large lock discrepancy between S3 and the local clock.");
|
||||
}
|
||||
}
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let new_data = "new remote blob data2";
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(new_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
})
|
||||
.await?;
|
||||
|
||||
retry(|| ctx.client.delete(&path1)).await?;
|
||||
let t2_files = list_files(&ctx.client).await?;
|
||||
let t2 = time_point().await;
|
||||
println!("at t2: {t2_files:?}");
|
||||
|
||||
// No changes after recovery to t2 (no-op)
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t2, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t2_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t2: {t2_files_recovered:?}");
|
||||
assert_eq!(t2_files, t2_files_recovered);
|
||||
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
assert_eq!(path2_recovered_t2, new_data.as_bytes());
|
||||
|
||||
// after recovery to t1: path1 is back, path2 has the old content
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t1, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t1_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t1: {t1_files_recovered:?}");
|
||||
assert_eq!(t1_files, t1_files_recovered);
|
||||
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
assert_eq!(path2_recovered_t1, old_data.as_bytes());
|
||||
|
||||
// after recovery to t0: everything is gone except for path1
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t0, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t0_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t0: {t0_files_recovered:?}");
|
||||
assert_eq!(t0_files, t0_files_recovered);
|
||||
|
||||
// cleanup
|
||||
|
||||
let paths = &[path1, path2, path3];
|
||||
retry(|| ctx.client.delete_objects(paths)).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct EnabledS3 {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
//! Tracing wrapper for Hyper HTTP server
|
||||
|
||||
use hyper::body::Body;
|
||||
use hyper::HeaderMap;
|
||||
use hyper::{Request, Response};
|
||||
use hyper::{Body, Request, Response};
|
||||
use std::future::Future;
|
||||
use tracing::Instrument;
|
||||
use tracing_opentelemetry::OpenTelemetrySpanExt;
|
||||
@@ -36,14 +35,14 @@ pub enum OtelName<'a> {
|
||||
/// instrumentation libraries at:
|
||||
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
|
||||
/// If a Hyper crate appears, consider switching to that.
|
||||
pub async fn tracing_handler<B1: Body, B2: Body, F, R>(
|
||||
req: Request<B1>,
|
||||
pub async fn tracing_handler<F, R>(
|
||||
req: Request<Body>,
|
||||
handler: F,
|
||||
otel_name: OtelName<'_>,
|
||||
) -> Response<B2>
|
||||
) -> Response<Body>
|
||||
where
|
||||
F: Fn(Request<B1>) -> R,
|
||||
R: Future<Output = Response<B2>>,
|
||||
F: Fn(Request<Body>) -> R,
|
||||
R: Future<Output = Response<Body>>,
|
||||
{
|
||||
// Create a tracing span, with context propagated from the incoming
|
||||
// request if any.
|
||||
|
||||
@@ -22,7 +22,6 @@ chrono.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
http-body-util = { workspace = true, features = [] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
|
||||
@@ -4,10 +4,7 @@ use crate::http::{
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
};
|
||||
use bytes::Bytes;
|
||||
use http_body_util::Full;
|
||||
use hyper::{Request, Response, StatusCode};
|
||||
use routerify::Body;
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -154,7 +151,7 @@ pub struct FailpointConfig {
|
||||
pub async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Full<Bytes>>, ApiError> {
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot manage failpoints because storage was compiled without failpoints support"
|
||||
|
||||
@@ -4,11 +4,11 @@ use anyhow::Context;
|
||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::Method;
|
||||
use hyper::{header::CONTENT_TYPE, Request, Response};
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
|
||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Body, Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
@@ -238,7 +238,7 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
|
||||
let body = Body::from_stream(ReceiverStream::new(rx));
|
||||
let body = Body::wrap_stream(ReceiverStream::new(rx));
|
||||
|
||||
let mut writer = ChannelWriter::new(128 * 1024, tx);
|
||||
|
||||
@@ -284,7 +284,7 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub fn add_request_id_middleware<B: hyper::body::Body + Send + Sync + 'static>(
|
||||
pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) {
|
||||
@@ -317,7 +317,7 @@ async fn add_request_id_header_to_response(
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn make_router() -> RouterBuilder<routerify::Body, ApiError> {
|
||||
pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
Router::builder()
|
||||
.middleware(add_request_id_middleware())
|
||||
.middleware(Middleware::post_with_info(
|
||||
@@ -328,11 +328,11 @@ pub fn make_router() -> RouterBuilder<routerify::Body, ApiError> {
|
||||
}
|
||||
|
||||
pub fn attach_openapi_ui(
|
||||
router_builder: RouterBuilder<routerify::Body, ApiError>,
|
||||
router_builder: RouterBuilder<hyper::Body, ApiError>,
|
||||
spec: &'static [u8],
|
||||
spec_mount_path: &'static str,
|
||||
ui_mount_path: &'static str,
|
||||
) -> RouterBuilder<routerify::Body, ApiError> {
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
router_builder
|
||||
.get(spec_mount_path,
|
||||
move |r| request_span(r, move |_| async move {
|
||||
@@ -388,7 +388,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
|
||||
Ok(token)
|
||||
}
|
||||
|
||||
pub fn auth_middleware<B: hyper::body::Body + Send + Sync + 'static>(
|
||||
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
@@ -423,7 +423,7 @@ pub fn add_response_header_middleware<B>(
|
||||
value: &str,
|
||||
) -> anyhow::Result<Middleware<B, ApiError>>
|
||||
where
|
||||
B: hyper::body::Body + Send + Sync + 'static,
|
||||
B: hyper::body::HttpBody + Send + Sync + 'static,
|
||||
{
|
||||
let name =
|
||||
HeaderName::from_str(header).with_context(|| format!("invalid header name: {header}"))?;
|
||||
@@ -464,6 +464,7 @@ pub fn check_permission_with(
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use futures::future::poll_fn;
|
||||
use hyper::service::Service;
|
||||
use routerify::RequestServiceBuilder;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
@@ -472,13 +473,16 @@ mod tests {
|
||||
async fn test_request_id_returned() {
|
||||
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
|
||||
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
|
||||
let service = builder.build(remote_addr);
|
||||
let mut service = builder.build(remote_addr);
|
||||
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
|
||||
panic!("request service is not ready: {:?}", e);
|
||||
}
|
||||
|
||||
let mut req: Request<Body> = Request::default();
|
||||
req.headers_mut()
|
||||
.append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap());
|
||||
|
||||
let resp: Response<Body> = service.call(req).await.unwrap();
|
||||
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
|
||||
|
||||
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap();
|
||||
|
||||
@@ -489,10 +493,13 @@ mod tests {
|
||||
async fn test_request_id_empty() {
|
||||
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
|
||||
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
|
||||
let service = builder.build(remote_addr);
|
||||
let mut service = builder.build(remote_addr);
|
||||
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
|
||||
panic!("request service is not ready: {:?}", e);
|
||||
}
|
||||
|
||||
let req: Request<Body> = Request::default();
|
||||
let resp: Response<Body> = service.call(req).await.unwrap();
|
||||
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
|
||||
|
||||
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER);
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use hyper::{header, Response, StatusCode};
|
||||
use routerify::Body;
|
||||
use hyper::{header, Body, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::borrow::Cow;
|
||||
use std::error::Error as StdError;
|
||||
@@ -132,7 +131,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
|
||||
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
|
||||
_ => error!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
|
||||
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
|
||||
_ => info!("Error processing HTTP request: {api_error:#}"),
|
||||
}
|
||||
|
||||
api_error.into_response()
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::{Buf, Bytes};
|
||||
use http_body_util::{BodyExt, Full};
|
||||
use hyper::{header, Request, Response, StatusCode};
|
||||
use routerify::Body;
|
||||
use anyhow::Context;
|
||||
use bytes::Buf;
|
||||
use hyper::{header, Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::error::ApiError;
|
||||
@@ -20,14 +18,10 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||
request: &mut Request<Body>,
|
||||
) -> Result<Option<T>, ApiError> {
|
||||
let body = request
|
||||
.body_mut()
|
||||
.collect()
|
||||
let body = hyper::body::aggregate(request.body_mut())
|
||||
.await
|
||||
.map_err(|e| anyhow!(e))
|
||||
.context("Failed to read request body")
|
||||
.map_err(ApiError::BadRequest)?
|
||||
.aggregate();
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
if body.remaining() == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -41,24 +35,17 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
pub fn json_response_body<T: Serialize>(
|
||||
status: StatusCode,
|
||||
data: T,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
json_response(status, data).map(|r| r.map(Body::new))
|
||||
}
|
||||
|
||||
pub fn json_response<T: Serialize>(
|
||||
status: StatusCode,
|
||||
data: T,
|
||||
) -> Result<Response<Full<Bytes>>, ApiError> {
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let json = serde_json::to_string(&data)
|
||||
.context("Failed to serialize JSON response")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let response = Response::builder()
|
||||
.status(status)
|
||||
.header(header::CONTENT_TYPE, "application/json")
|
||||
.body(Full::from(json))
|
||||
.body(Body::from(json))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
@@ -5,4 +5,4 @@ pub mod request;
|
||||
|
||||
/// Current fast way to apply simple http routing in various Neon binaries.
|
||||
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
|
||||
pub use routerify::{ext::RequestExt, Body, RequestServiceBuilder, RouterBuilder};
|
||||
pub use routerify::{ext::RequestExt, RouterBuilder, RouterService};
|
||||
|
||||
@@ -3,9 +3,8 @@ use std::{borrow::Cow, str::FromStr};
|
||||
|
||||
use super::error::ApiError;
|
||||
use anyhow::anyhow;
|
||||
use http_body_util::BodyExt;
|
||||
use hyper::Request;
|
||||
use routerify::{ext::RequestExt, Body};
|
||||
use hyper::{body::HttpBody, Body, Request};
|
||||
use routerify::ext::RequestExt;
|
||||
|
||||
pub fn get_request_param<'a>(
|
||||
request: &'a Request<Body>,
|
||||
@@ -76,7 +75,7 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
|
||||
}
|
||||
|
||||
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
|
||||
match request.body_mut().frame().await {
|
||||
match request.body_mut().data().await {
|
||||
Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
|
||||
None => Ok(()),
|
||||
}
|
||||
|
||||
@@ -5,10 +5,10 @@ use std::os::unix::io::RawFd;
|
||||
pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
|
||||
let bits = fcntl(fd, F_GETFL)?;
|
||||
|
||||
// Safety: If F_GETFL returns some unknown bits, they should be valid
|
||||
// If F_GETFL returns some unknown bits, they should be valid
|
||||
// for passing back to F_SETFL, too. If we left them out, the F_SETFL
|
||||
// would effectively clear them, which is not what we want.
|
||||
let mut flags = unsafe { OFlag::from_bits_unchecked(bits) };
|
||||
let mut flags = OFlag::from_bits_retain(bits);
|
||||
flags |= OFlag::O_NONBLOCK;
|
||||
|
||||
fcntl(fd, F_SETFL(flags))?;
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::{
|
||||
io,
|
||||
net::{TcpListener, ToSocketAddrs},
|
||||
os::unix::prelude::AsRawFd,
|
||||
};
|
||||
|
||||
use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
|
||||
@@ -10,7 +9,7 @@ use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
|
||||
pub fn bind<A: ToSocketAddrs>(addr: A) -> io::Result<TcpListener> {
|
||||
let listener = TcpListener::bind(addr)?;
|
||||
|
||||
setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?;
|
||||
setsockopt(&listener, ReuseAddr, &true)?;
|
||||
|
||||
Ok(listener)
|
||||
}
|
||||
|
||||
@@ -61,6 +61,7 @@ sync_wrapper.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-epoll-uring.workspace = true
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
|
||||
@@ -108,9 +108,32 @@ pub struct RelTagBlockNo {
|
||||
}
|
||||
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(mut self) {
|
||||
let _ = self.cancel_on_client_drop.take();
|
||||
self.conn_task.await.unwrap();
|
||||
pub async fn shutdown(self) {
|
||||
let Self {
|
||||
copy_both,
|
||||
cancel_on_client_drop: cancel_conn_task,
|
||||
conn_task,
|
||||
} = self;
|
||||
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
|
||||
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
|
||||
//
|
||||
// If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`,
|
||||
// the CopyFail mesage only makes it to the socket sometimes (i.e., it's a race).
|
||||
//
|
||||
// Further, the pageserver makes a lot of noise when it receives CopyFail.
|
||||
// Computes don't send it in practice, they just hard-close the connection.
|
||||
//
|
||||
// So, let's behave like the computes and suppress the CopyFail as follows:
|
||||
// kill the socket first, then drop copy_both.
|
||||
//
|
||||
// See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY
|
||||
//
|
||||
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
|
||||
// => https://github.com/neondatabase/neon/issues/6390
|
||||
let _ = cancel_conn_task.unwrap();
|
||||
conn_task.await.unwrap();
|
||||
drop(copy_both);
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
|
||||
@@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader;
|
||||
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
|
||||
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
|
||||
use pageserver::tenant::storage_layer::range_overlaps;
|
||||
use pageserver::virtual_file::VirtualFile;
|
||||
use pageserver::virtual_file::{self, VirtualFile};
|
||||
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
||||
pageserver::virtual_file::init(10);
|
||||
pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let mut total_delta_layers = 0usize;
|
||||
|
||||
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
virtual_file::init(10);
|
||||
virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
|
||||
page_cache::init(100);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path).await?);
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
} => {
|
||||
pageserver::virtual_file::init(10);
|
||||
pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
|
||||
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
|
||||
@@ -404,23 +404,27 @@ async fn client(
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while let Some(req) =
|
||||
tokio::select! { work = work.recv() => { work } , _ = cancel.cancelled() => { return; } }
|
||||
{
|
||||
let start = Instant::now();
|
||||
|
||||
let res = tokio::select! {
|
||||
res = client.getpage(req) => { res },
|
||||
_ = cancel.cancelled() => { return; }
|
||||
};
|
||||
res.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
let do_requests = async {
|
||||
start_work_barrier.wait().await;
|
||||
while let Some(req) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
client
|
||||
.getpage(req)
|
||||
.await
|
||||
.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
};
|
||||
tokio::select! {
|
||||
res = do_requests => { res },
|
||||
_ = cancel.cancelled() => {
|
||||
// fallthrough to shutdown
|
||||
}
|
||||
}
|
||||
client.shutdown().await;
|
||||
}
|
||||
|
||||
@@ -35,6 +35,7 @@ fn main() {
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
let args = Args::parse();
|
||||
match args {
|
||||
|
||||
@@ -11,8 +11,9 @@
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{key_to_slru_block, Key};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
@@ -133,6 +134,87 @@ where
|
||||
ctx: &'a RequestContext,
|
||||
}
|
||||
|
||||
/// A sink that accepts SLRU blocks ordered by key and forwards
|
||||
/// full segments to the archive.
|
||||
struct SlruSegmentsBuilder<'a, 'b, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
ar: &'a mut Builder<&'b mut W>,
|
||||
buf: Vec<u8>,
|
||||
current_segment: Option<(SlruKind, u32)>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
fn new(ar: &'a mut Builder<&'b mut W>) -> Self {
|
||||
Self {
|
||||
ar,
|
||||
buf: Vec::new(),
|
||||
current_segment: None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
|
||||
let (kind, segno, _) = key_to_slru_block(*key)?;
|
||||
|
||||
match kind {
|
||||
SlruKind::Clog => {
|
||||
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
|
||||
}
|
||||
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
|
||||
ensure!(block.len() == BLCKSZ as usize);
|
||||
}
|
||||
}
|
||||
|
||||
let segment = (kind, segno);
|
||||
match self.current_segment {
|
||||
None => {
|
||||
self.current_segment = Some(segment);
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
Some(current_seg) if current_seg == segment => {
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
Some(_) => {
|
||||
self.flush().await?;
|
||||
|
||||
self.current_segment = Some(segment);
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> anyhow::Result<()> {
|
||||
let nblocks = self.buf.len() / BLCKSZ as usize;
|
||||
let (kind, segno) = self.current_segment.take().unwrap();
|
||||
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
|
||||
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
||||
self.ar.append(&header, self.buf.as_slice()).await?;
|
||||
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
|
||||
self.buf.clear();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn finish(mut self) -> anyhow::Result<()> {
|
||||
if self.current_segment.is_none() || self.buf.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.flush().await
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W> Basebackup<'a, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -168,20 +250,27 @@ where
|
||||
}
|
||||
|
||||
// Gather non-relational files from object storage pages.
|
||||
for kind in [
|
||||
SlruKind::Clog,
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
for segno in self
|
||||
let slru_partitions = self
|
||||
.timeline
|
||||
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
|
||||
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
for part in slru_partitions.parts {
|
||||
let blocks = self
|
||||
.timeline
|
||||
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
.get_vectored(&part.ranges, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
slru_builder.add_block(&key, block?).await?;
|
||||
}
|
||||
}
|
||||
|
||||
slru_builder.finish().await?;
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
@@ -305,39 +394,6 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Generate SLRU segment files from repository.
|
||||
//
|
||||
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
|
||||
} else {
|
||||
ensure!(img.len() == BLCKSZ as usize);
|
||||
}
|
||||
|
||||
slru_buf.extend_from_slice(&img[..BLCKSZ as usize]);
|
||||
}
|
||||
|
||||
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
|
||||
let header = new_tar_header(&segname, slru_buf.len() as u64)?;
|
||||
self.ar.append(&header, slru_buf.as_slice()).await?;
|
||||
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Include database/tablespace directories.
|
||||
//
|
||||
|
||||
@@ -130,7 +130,7 @@ fn main() -> anyhow::Result<()> {
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
||||
|
||||
@@ -36,6 +36,7 @@ use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::virtual_file;
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
@@ -43,6 +44,8 @@ use crate::{
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
|
||||
use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant::config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
@@ -79,6 +82,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -114,6 +119,8 @@ pub mod defaults {
|
||||
|
||||
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
|
||||
|
||||
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -247,6 +254,8 @@ pub struct PageServerConf {
|
||||
|
||||
/// Maximum number of WAL records to be ingested and committed at the same time
|
||||
pub ingest_batch_size: u64,
|
||||
|
||||
pub virtual_file_io_engine: virtual_file::IoEngineKind,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -331,6 +340,8 @@ struct PageServerConfigBuilder {
|
||||
secondary_download_concurrency: BuilderValue<usize>,
|
||||
|
||||
ingest_batch_size: BuilderValue<u64>,
|
||||
|
||||
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -406,6 +417,8 @@ impl Default for PageServerConfigBuilder {
|
||||
secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
|
||||
|
||||
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
|
||||
|
||||
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -562,6 +575,10 @@ impl PageServerConfigBuilder {
|
||||
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
|
||||
}
|
||||
|
||||
pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
|
||||
self.virtual_file_io_engine = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
@@ -669,6 +686,9 @@ impl PageServerConfigBuilder {
|
||||
ingest_batch_size: self
|
||||
.ingest_batch_size
|
||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
||||
virtual_file_io_engine: self
|
||||
.virtual_file_io_engine
|
||||
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -920,6 +940,9 @@ impl PageServerConf {
|
||||
builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
|
||||
},
|
||||
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
|
||||
"virtual_file_io_engine" => {
|
||||
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -993,6 +1016,7 @@ impl PageServerConf {
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1225,6 +1249,7 @@ background_task_maximum_delay = '334 s'
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1288,6 +1313,7 @@ background_task_maximum_delay = '334 s'
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: 100,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -386,39 +386,56 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// If we get far enough in the list that we start to evict layers that are below
|
||||
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
|
||||
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
|
||||
let mut warned = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
let mut evicted_amount = 0;
|
||||
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
if !usage_planned.has_pressure() {
|
||||
debug!(
|
||||
no_candidates_evicted = i,
|
||||
"took enough candidates for pressure to be relieved"
|
||||
);
|
||||
break;
|
||||
let selection = select_victims(&candidates, usage_pre);
|
||||
|
||||
let mut candidates = candidates;
|
||||
|
||||
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
|
||||
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
|
||||
// for comparison here. this is a temporary measure to develop alternatives.
|
||||
use std::fmt::Write;
|
||||
|
||||
let mut summary_buf = String::with_capacity(256);
|
||||
|
||||
{
|
||||
let absolute_summary = candidates
|
||||
.iter()
|
||||
.take(selection.amount)
|
||||
.map(|(_, candidate)| candidate)
|
||||
.collect::<summary::EvictionSummary>();
|
||||
|
||||
write!(summary_buf, "{absolute_summary}").expect("string grows");
|
||||
|
||||
info!("absolute accessed selection summary: {summary_buf}");
|
||||
}
|
||||
|
||||
if partition == &MinResidentSizePartition::Below && warned.is_none() {
|
||||
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
|
||||
warned = Some(usage_planned);
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
|
||||
let selection = select_victims(&candidates, usage_pre);
|
||||
|
||||
{
|
||||
summary_buf.clear();
|
||||
|
||||
let relative_summary = candidates
|
||||
.iter()
|
||||
.take(selection.amount)
|
||||
.map(|(_, candidate)| candidate)
|
||||
.collect::<summary::EvictionSummary>();
|
||||
|
||||
write!(summary_buf, "{relative_summary}").expect("string grows");
|
||||
|
||||
info!("relative accessed selection summary: {summary_buf}");
|
||||
}
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.get_file_size());
|
||||
evicted_amount += 1;
|
||||
}
|
||||
|
||||
let usage_planned = match warned {
|
||||
Some(respecting_tenant_min_resident_size) => PlannedUsage {
|
||||
respecting_tenant_min_resident_size,
|
||||
fallback_to_global_lru: Some(usage_planned),
|
||||
},
|
||||
None => PlannedUsage {
|
||||
respecting_tenant_min_resident_size: usage_planned,
|
||||
fallback_to_global_lru: None,
|
||||
},
|
||||
selection
|
||||
} else {
|
||||
selection
|
||||
};
|
||||
debug!(?usage_planned, "usage planned");
|
||||
|
||||
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
|
||||
|
||||
// phase2: evict layers
|
||||
|
||||
@@ -910,22 +927,80 @@ async fn collect_eviction_candidates(
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
|
||||
match eviction_order {
|
||||
EvictionOrder::AbsoluteAccessed => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.last_activity_ts)
|
||||
});
|
||||
}
|
||||
EvictionOrder::RelativeAccessed { .. } => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
}
|
||||
}
|
||||
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
|
||||
// will sort later by candidate.relative_last_activity to get compare evictions.
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
}
|
||||
|
||||
/// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
|
||||
/// relieve pressure.
|
||||
///
|
||||
/// Returns the amount of candidates selected, with the planned usage.
|
||||
fn select_victims<U: Usage>(
|
||||
candidates: &[(MinResidentSizePartition, EvictionCandidate)],
|
||||
usage_pre: U,
|
||||
) -> VictimSelection<U> {
|
||||
let mut usage_when_switched = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
let mut evicted_amount = 0;
|
||||
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
if !usage_planned.has_pressure() {
|
||||
break;
|
||||
}
|
||||
|
||||
if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
|
||||
usage_when_switched = Some((usage_planned, i));
|
||||
}
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.get_file_size());
|
||||
evicted_amount += 1;
|
||||
}
|
||||
|
||||
VictimSelection {
|
||||
amount: evicted_amount,
|
||||
usage_pre,
|
||||
usage_when_switched,
|
||||
usage_planned,
|
||||
}
|
||||
}
|
||||
|
||||
struct VictimSelection<U> {
|
||||
amount: usize,
|
||||
usage_pre: U,
|
||||
usage_when_switched: Option<(U, usize)>,
|
||||
usage_planned: U,
|
||||
}
|
||||
|
||||
impl<U: Usage> VictimSelection<U> {
|
||||
fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
|
||||
debug!(
|
||||
evicted_amount=%self.amount,
|
||||
"took enough candidates for pressure to be relieved"
|
||||
);
|
||||
|
||||
if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
|
||||
warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
|
||||
}
|
||||
|
||||
let planned = match self.usage_when_switched {
|
||||
Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
|
||||
respecting_tenant_min_resident_size,
|
||||
fallback_to_global_lru: Some(self.usage_planned),
|
||||
},
|
||||
None => PlannedUsage {
|
||||
respecting_tenant_min_resident_size: self.usage_planned,
|
||||
fallback_to_global_lru: None,
|
||||
},
|
||||
};
|
||||
|
||||
(self.amount, planned)
|
||||
}
|
||||
}
|
||||
|
||||
struct TimelineKey(Arc<Timeline>);
|
||||
|
||||
impl PartialEq for TimelineKey {
|
||||
@@ -1010,6 +1085,137 @@ pub(crate) mod finite_f32 {
|
||||
}
|
||||
}
|
||||
|
||||
mod summary {
|
||||
use super::finite_f32::FiniteF32;
|
||||
use super::{EvictionCandidate, LayerCount};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::time::SystemTime;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub(super) struct EvictionSummary {
|
||||
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
|
||||
total: LayerCount,
|
||||
|
||||
last_absolute: Option<SystemTime>,
|
||||
last_relative: Option<FiniteF32>,
|
||||
}
|
||||
|
||||
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
|
||||
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
|
||||
let mut summary = EvictionSummary::default();
|
||||
for item in iter {
|
||||
let counts = summary
|
||||
.evicted_per_tenant
|
||||
.entry(*item.layer.get_tenant_shard_id())
|
||||
.or_default();
|
||||
|
||||
let sz = item.layer.get_file_size();
|
||||
|
||||
counts.file_sizes += sz;
|
||||
counts.count += 1;
|
||||
|
||||
summary.total.file_sizes += sz;
|
||||
summary.total.count += 1;
|
||||
|
||||
summary.last_absolute = Some(item.last_activity_ts);
|
||||
summary.last_relative = Some(item.relative_last_activity);
|
||||
}
|
||||
|
||||
summary
|
||||
}
|
||||
}
|
||||
|
||||
struct SiBytesAmount(u64);
|
||||
|
||||
impl std::fmt::Display for SiBytesAmount {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.0 < 1024 {
|
||||
return write!(f, "{}B", self.0);
|
||||
}
|
||||
|
||||
let mut tmp = self.0;
|
||||
let mut ch = 0;
|
||||
let suffixes = b"KMGTPE";
|
||||
|
||||
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
|
||||
tmp /= 1024;
|
||||
ch += 1;
|
||||
}
|
||||
|
||||
let ch = suffixes[ch] as char;
|
||||
|
||||
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EvictionSummary {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// wasteful, but it's for testing
|
||||
|
||||
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
|
||||
|
||||
for (tenant_shard_id, count) in &self.evicted_per_tenant {
|
||||
sorted
|
||||
.entry(count.count)
|
||||
.or_default()
|
||||
.push((*tenant_shard_id, count.file_sizes));
|
||||
}
|
||||
|
||||
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
|
||||
|
||||
writeln!(
|
||||
f,
|
||||
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
|
||||
self.total.count, self.last_absolute, self.last_relative,
|
||||
)?;
|
||||
|
||||
for (count, per_tenant) in sorted.iter().rev().take(10) {
|
||||
write!(f, "- {count} layers: ")?;
|
||||
|
||||
if per_tenant.len() < 3 {
|
||||
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
let bytes = SiBytesAmount(*bytes);
|
||||
write!(f, "{tenant_shard_id} ({bytes})")?;
|
||||
}
|
||||
} else {
|
||||
let num_tenants = per_tenant.len();
|
||||
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
|
||||
let total_bytes = SiBytesAmount(total_bytes);
|
||||
let layers = num_tenants * count;
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{num_tenants} tenants {total_bytes} in total {layers} layers",
|
||||
)?;
|
||||
}
|
||||
|
||||
writeln!(f)?;
|
||||
}
|
||||
|
||||
if sorted.len() > 10 {
|
||||
let (rem_count, rem_bytes) = sorted
|
||||
.iter()
|
||||
.rev()
|
||||
.map(|(count, per_tenant)| {
|
||||
(
|
||||
count,
|
||||
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
|
||||
)
|
||||
})
|
||||
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
|
||||
let rem_bytes = SiBytesAmount(rem_bytes);
|
||||
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mod filesystem_level_usage {
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
|
||||
@@ -877,6 +877,56 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
Marks the initdb archive for preservation upon deletion of the timeline or tenant.
|
||||
This is meant to be part of the disaster recovery process.
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"404":
|
||||
description: No tenant or timeline found for the specified ids
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
|
||||
@@ -12,7 +12,7 @@ use futures::TryFutureExt;
|
||||
use humantime::format_rfc3339;
|
||||
use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Request, Response, Uri};
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
@@ -32,7 +32,6 @@ use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
use utils::http::Body;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
@@ -65,7 +64,7 @@ use utils::{
|
||||
http::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
|
||||
error::{ApiError, HttpErrorBody},
|
||||
json::{json_request, json_response_body as json_response},
|
||||
json::{json_request, json_response},
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
@@ -188,6 +187,7 @@ impl From<TenantSlotUpsertError> for ApiError {
|
||||
match e {
|
||||
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
|
||||
MapState(e) => e.into(),
|
||||
ShuttingDown(_) => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -496,6 +496,10 @@ async fn timeline_create_handler(
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
Err(_) if tenant.cancel.is_cancelled() => {
|
||||
// In case we get some ugly error type during shutdown, cast it into a clean 503.
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
|
||||
json_response(StatusCode::CONFLICT, ())
|
||||
}
|
||||
@@ -562,6 +566,43 @@ async fn timeline_list_handler(
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
|
||||
async fn timeline_preserve_initdb_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// Part of the process for disaster recovery from safekeeper-stored WAL:
|
||||
// If we don't recover into a new timeline but want to keep the timeline ID,
|
||||
// then the initdb archive is deleted. This endpoint copies it to a different
|
||||
// location where timeline recreation cand find it.
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
timeline
|
||||
.preserve_initdb_archive()
|
||||
.await
|
||||
.context("preserving initdb archive")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_preserve_initdb_archive",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
%timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1221,19 +1262,9 @@ async fn tenant_create_handler(
|
||||
};
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
if let res @ Err(_) = new_tenant
|
||||
new_tenant
|
||||
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||
.await
|
||||
{
|
||||
// This shouldn't happen because we just created the tenant directory
|
||||
// in upsert_location, and there aren't any remote timelines
|
||||
// to load, so, nothing can really fail during load.
|
||||
// Don't do cleanup because we don't know how we got here.
|
||||
// The tenant will likely be in `Broken` state and subsequent
|
||||
// calls will fail.
|
||||
res.context("created tenant failed to become active")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
.await?;
|
||||
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
@@ -1572,7 +1603,7 @@ async fn getpage_at_lsn_handler(
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "application/octet-stream")
|
||||
.body(Body::from(page))
|
||||
.body(hyper::Body::from(page))
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
@@ -1869,7 +1900,7 @@ pub fn make_router(
|
||||
state: Arc<State>,
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
) -> anyhow::Result<RouterBuilder<Body, ApiError>> {
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
if auth.is_some() {
|
||||
@@ -1944,6 +1975,10 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||
api_handler(r, tenant_ignore_handler)
|
||||
})
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
)
|
||||
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||
api_handler(r, timeline_detail_handler)
|
||||
})
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#![recursion_limit = "300"]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod auth;
|
||||
|
||||
@@ -150,6 +150,43 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct GetVectoredLatency {
|
||||
map: EnumMap<TaskKind, Option<Histogram>>,
|
||||
}
|
||||
|
||||
impl GetVectoredLatency {
|
||||
// Only these task types perform vectored gets. Filter all other tasks out to reduce total
|
||||
// cardinality of the metric.
|
||||
const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
|
||||
|
||||
pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
|
||||
self.map[task_kind].as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_get_vectored_seconds",
|
||||
"Time spent in get_vectored",
|
||||
&["task_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
GetVectoredLatency {
|
||||
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
|
||||
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
|
||||
|
||||
if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
|
||||
let task_kind = task_kind.into();
|
||||
Some(inner.with_label_values(&[task_kind]))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct PageCacheMetricsForTaskKind {
|
||||
pub read_accesses_materialized_page: IntCounter,
|
||||
pub read_accesses_immutable: IntCounter,
|
||||
@@ -932,6 +969,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(crate) mod virtual_file_descriptor_cache {
|
||||
use super::*;
|
||||
|
||||
@@ -951,6 +989,20 @@ pub(crate) mod virtual_file_descriptor_cache {
|
||||
// ```
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(crate) mod virtual_file_io_engine {
|
||||
use super::*;
|
||||
|
||||
pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_virtual_file_io_engine_kind",
|
||||
"The configured io engine for VirtualFile",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct GlobalAndPerTimelineHistogram {
|
||||
global: Histogram,
|
||||
|
||||
@@ -61,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -75,6 +75,7 @@ use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
@@ -321,8 +322,8 @@ enum PageStreamError {
|
||||
Shutdown,
|
||||
|
||||
/// Something went wrong reading a page: this likely indicates a pageserver bug
|
||||
#[error("Read error: {0}")]
|
||||
Read(PageReconstructError),
|
||||
#[error("Read error")]
|
||||
Read(#[source] PageReconstructError),
|
||||
|
||||
/// Ran out of time waiting for an LSN
|
||||
#[error("LSN timeout: {0}")]
|
||||
@@ -331,11 +332,11 @@ enum PageStreamError {
|
||||
/// The entity required to serve the request (tenant or timeline) is not found,
|
||||
/// or is not found in a suitable state to serve a request.
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(std::borrow::Cow<'static, str>),
|
||||
NotFound(Cow<'static, str>),
|
||||
|
||||
/// Request asked for something that doesn't make sense, like an invalid LSN
|
||||
#[error("Bad request: {0}")]
|
||||
BadRequest(std::borrow::Cow<'static, str>),
|
||||
BadRequest(Cow<'static, str>),
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for PageStreamError {
|
||||
@@ -384,11 +385,23 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Analogous to calling cancelled() on a Timeline's cancellation token: waits for cancellation.
|
||||
/// Future that completes when we need to shut down the connection.
|
||||
///
|
||||
/// We use many Timeline objects, and hold GateGuards on all of them. We must therefore respect
|
||||
/// all of their cancellation tokens.
|
||||
async fn timeline_cancelled(&self) {
|
||||
/// We currently need to shut down when any of the following happens:
|
||||
/// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
|
||||
/// 2. task_mgr requests shutdown of the connection
|
||||
///
|
||||
/// NB on (1): the connection's lifecycle is not actually tied to any of the
|
||||
/// `shard_timelines`s' lifecycles. But it's _necessary_ in the current
|
||||
/// implementation to be responsive to timeline cancellation because
|
||||
/// the connection holds their `GateGuards` open (sored in `shard_timelines`).
|
||||
/// We currently do the easy thing and terminate the connection if any of the
|
||||
/// shard_timelines gets cancelled. But really, we cuold spend more effort
|
||||
/// and simply remove the cancelled timeline from the `shard_timelines`, thereby
|
||||
/// dropping the guard.
|
||||
///
|
||||
/// NB: keep in sync with [`Self::is_connection_cancelled`]
|
||||
async fn await_connection_cancelled(&self) {
|
||||
// A short wait before we expend the cycles to walk our timeline map. This avoids incurring
|
||||
// that cost every time we check for cancellation.
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
@@ -398,20 +411,26 @@ impl PageServerHandler {
|
||||
// immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk
|
||||
// missing any inserts to the map.
|
||||
|
||||
let mut futs = self
|
||||
.shard_timelines
|
||||
.values()
|
||||
.map(|ht| ht.timeline.cancel.cancelled())
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
futs.next().await;
|
||||
let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len());
|
||||
use futures::future::Either;
|
||||
cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher()));
|
||||
cancellation_sources.extend(
|
||||
self.shard_timelines
|
||||
.values()
|
||||
.map(|ht| Either::Right(ht.timeline.cancel.cancelled())),
|
||||
);
|
||||
FuturesUnordered::from_iter(cancellation_sources)
|
||||
.next()
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Analogous to calling is_cancelled() on a Timeline's cancellation token
|
||||
fn timeline_is_cancelled(&self) -> bool {
|
||||
self.shard_timelines
|
||||
.values()
|
||||
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
|
||||
/// Checking variant of [`Self::await_connection_cancelled`].
|
||||
fn is_connection_cancelled(&self) -> bool {
|
||||
task_mgr::is_shutdown_requested()
|
||||
|| self
|
||||
.shard_timelines
|
||||
.values()
|
||||
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
|
||||
}
|
||||
|
||||
/// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in
|
||||
@@ -432,7 +451,7 @@ impl PageServerHandler {
|
||||
flush_r = pgb.flush() => {
|
||||
Ok(flush_r?)
|
||||
},
|
||||
_ = self.timeline_cancelled() => {
|
||||
_ = self.await_connection_cancelled() => {
|
||||
Err(QueryError::Shutdown)
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
@@ -549,7 +568,7 @@ impl PageServerHandler {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = self.timeline_cancelled() => {
|
||||
_ = self.await_connection_cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
@@ -632,7 +651,7 @@ impl PageServerHandler {
|
||||
span.in_scope(|| info!("handler requested reconnect: {reason}"));
|
||||
return Err(QueryError::Reconnect);
|
||||
}
|
||||
Err(e) if self.timeline_is_cancelled() => {
|
||||
Err(e) if self.is_connection_cancelled() => {
|
||||
// This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
|
||||
// shutdown error, this may be buried inside a PageReconstructError::Other for example.
|
||||
//
|
||||
@@ -648,7 +667,10 @@ impl PageServerHandler {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
let full = utils::error::report_compact_sources(&e);
|
||||
span.in_scope(|| {
|
||||
error!("error reading relation or page version: {full:#}")
|
||||
});
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
|
||||
@@ -13,7 +13,12 @@ use crate::repository::*;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -22,6 +27,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
@@ -528,6 +534,33 @@ impl Timeline {
|
||||
Ok(Default::default())
|
||||
}
|
||||
|
||||
pub(crate) async fn get_slru_keyspace(
|
||||
&self,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<KeySpace, PageReconstructError> {
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
|
||||
for kind in SlruKind::iter() {
|
||||
let mut segments: Vec<u32> = self
|
||||
.list_slru_segments(kind, version, ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
segments.sort_unstable();
|
||||
|
||||
for seg in segments {
|
||||
let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;
|
||||
|
||||
accum.add_range(
|
||||
slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(accum.to_keyspace())
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub(crate) async fn list_slru_segments(
|
||||
&self,
|
||||
@@ -1535,366 +1568,6 @@ struct SlruSegmentDirectory {
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
|
||||
// Layout of the Key address space
|
||||
//
|
||||
// The Key struct, used to address the underlying key-value store, consists of
|
||||
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
|
||||
// all the data and metadata keys into those 18 bytes.
|
||||
//
|
||||
// Principles for the mapping:
|
||||
//
|
||||
// - Things that are often accessed or modified together, should be close to
|
||||
// each other in the key space. For example, if a relation is extended by one
|
||||
// block, we create a new key-value pair for the block data, and update the
|
||||
// relation size entry. Because of that, the RelSize key comes after all the
|
||||
// RelBlocks of a relation: the RelSize and the last RelBlock are always next
|
||||
// to each other.
|
||||
//
|
||||
// The key space is divided into four major sections, identified by the first
|
||||
// byte, and the form a hierarchy:
|
||||
//
|
||||
// 00 Relation data and metadata
|
||||
//
|
||||
// DbDir () -> (dbnode, spcnode)
|
||||
// Filenodemap
|
||||
// RelDir -> relnode forknum
|
||||
// RelBlocks
|
||||
// RelSize
|
||||
//
|
||||
// 01 SLRUs
|
||||
//
|
||||
// SlruDir kind
|
||||
// SlruSegBlocks segno
|
||||
// SlruSegSize
|
||||
//
|
||||
// 02 pg_twophase
|
||||
//
|
||||
// 03 misc
|
||||
// Controlfile
|
||||
// checkpoint
|
||||
// pg_version
|
||||
//
|
||||
// 04 aux files
|
||||
//
|
||||
// Below is a full list of the keyspace allocation:
|
||||
//
|
||||
// DbDir:
|
||||
// 00 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// Filenodemap:
|
||||
// 00 SPCNODE DBNODE 00000000 00 00000000
|
||||
//
|
||||
// RelDir:
|
||||
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
|
||||
//
|
||||
// RelBlock:
|
||||
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
|
||||
//
|
||||
// RelSize:
|
||||
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
|
||||
//
|
||||
// SlruDir:
|
||||
// 01 kind 00000000 00000000 00 00000000
|
||||
//
|
||||
// SlruSegBlock:
|
||||
// 01 kind 00000001 SEGNO 00 BLKNUM
|
||||
//
|
||||
// SlruSegSize:
|
||||
// 01 kind 00000001 SEGNO 00 FFFFFFFF
|
||||
//
|
||||
// TwoPhaseDir:
|
||||
// 02 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// TwoPhaseFile:
|
||||
// 02 00000000 00000000 00000000 00 XID
|
||||
//
|
||||
// ControlFile:
|
||||
// 03 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// Checkpoint:
|
||||
// 03 00000000 00000000 00000000 00 00000001
|
||||
//
|
||||
// AuxFiles:
|
||||
// 03 00000000 00000000 00000000 00 00000002
|
||||
//
|
||||
|
||||
//-- Section 01: relation data and metadata
|
||||
|
||||
const DBDIR_KEY: Key = Key {
|
||||
field1: 0x00,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0xffffffff,
|
||||
field5: 0xff,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 1,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
|
||||
fn rel_size_to_key(rel: RelTag) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
fn rel_key_range(rel: RelTag) -> Range<Key> {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum + 1,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 02: SLRUs
|
||||
|
||||
fn slru_dir_to_key(kind: SlruKind) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
|
||||
fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
},
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0xffffffff,
|
||||
}
|
||||
}
|
||||
|
||||
fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
|
||||
let field2 = match kind {
|
||||
SlruKind::Clog => 0x00,
|
||||
SlruKind::MultiXactMembers => 0x01,
|
||||
SlruKind::MultiXactOffsets => 0x02,
|
||||
};
|
||||
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2,
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x01,
|
||||
field2,
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 1,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 03: pg_twophase
|
||||
|
||||
const TWOPHASEDIR_KEY: Key = Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
fn twophase_file_key(xid: TransactionId) -> Key {
|
||||
Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: xid,
|
||||
}
|
||||
}
|
||||
|
||||
fn twophase_key_range(xid: TransactionId) -> Range<Key> {
|
||||
let (next_xid, overflowed) = xid.overflowing_add(1);
|
||||
|
||||
Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: xid,
|
||||
}..Key {
|
||||
field1: 0x02,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: u8::from(overflowed),
|
||||
field6: next_xid,
|
||||
}
|
||||
}
|
||||
|
||||
//-- Section 03: Control file
|
||||
const CONTROLFILE_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
};
|
||||
|
||||
const CHECKPOINT_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 1,
|
||||
};
|
||||
|
||||
const AUX_FILES_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 2,
|
||||
};
|
||||
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
pub fn is_inherited_key(key: Key) -> bool {
|
||||
key != AUX_FILES_KEY
|
||||
}
|
||||
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
pub fn is_rel_vm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00
|
||||
&& key.field4 != 0
|
||||
&& key.field5 == VISIBILITYMAP_FORKNUM
|
||||
&& key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x01 => {
|
||||
let kind = match key.field2 {
|
||||
0x00 => SlruKind::Clog,
|
||||
0x01 => SlruKind::MultiXactMembers,
|
||||
0x02 => SlruKind::MultiXactOffsets,
|
||||
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
|
||||
};
|
||||
let segno = key.field4;
|
||||
let blknum = key.field6;
|
||||
|
||||
(kind, segno, blknum)
|
||||
}
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
fn is_slru_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x01 // SLRU-related
|
||||
&& key.field3 == 0x00000001 // but not SlruDir
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -91,7 +91,6 @@ use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::ops::Bound::Included;
|
||||
use std::process::Stdio;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
@@ -628,9 +627,15 @@ impl Tenant {
|
||||
deletion_queue_client,
|
||||
));
|
||||
|
||||
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
|
||||
// we shut down while attaching.
|
||||
let Ok(attach_gate_guard) = tenant.gate.enter() else {
|
||||
// We just created the Tenant: nothing else can have shut it down yet
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
// Do all the hard work in the background
|
||||
let tenant_clone = Arc::clone(&tenant);
|
||||
|
||||
let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
@@ -640,6 +645,8 @@ impl Tenant {
|
||||
"attach tenant",
|
||||
false,
|
||||
async move {
|
||||
let _gate_guard = attach_gate_guard;
|
||||
|
||||
// Is this tenant being spawned as part of process startup?
|
||||
let starting_up = init_order.is_some();
|
||||
scopeguard::defer! {
|
||||
@@ -716,6 +723,10 @@ impl Tenant {
|
||||
// stayed in Activating for such a long time that shutdown found it in
|
||||
// that state.
|
||||
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
|
||||
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
|
||||
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
|
||||
// just shutting down), but ensures progress.
|
||||
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
|
||||
return Ok(());
|
||||
},
|
||||
)
|
||||
@@ -810,7 +821,7 @@ impl Tenant {
|
||||
SpawnMode::Create => None,
|
||||
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
||||
};
|
||||
match tenant_clone.attach(preload, &ctx).await {
|
||||
match tenant_clone.attach(preload, mode, &ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
if let Some(t)= attach_timer {t.observe_duration();}
|
||||
@@ -897,15 +908,20 @@ impl Tenant {
|
||||
async fn attach(
|
||||
self: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
|
||||
let preload = match preload {
|
||||
Some(p) => p,
|
||||
None => {
|
||||
let preload = match (preload, mode) {
|
||||
(Some(p), _) => p,
|
||||
(None, SpawnMode::Create) => TenantPreload {
|
||||
deleting: false,
|
||||
timelines: HashMap::new(),
|
||||
},
|
||||
(None, SpawnMode::Normal) => {
|
||||
// Deprecated dev mode: load from local disk state instead of remote storage
|
||||
// https://github.com/neondatabase/neon/issues/5624
|
||||
return self.load_local(ctx).await;
|
||||
@@ -1013,7 +1029,10 @@ impl Tenant {
|
||||
// IndexPart is the source of truth.
|
||||
self.clean_up_timelines(&existent_timelines)?;
|
||||
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
|
||||
fail::fail_point!("attach-before-activate", |_| {
|
||||
anyhow::bail!("attach-before-activate");
|
||||
});
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
|
||||
|
||||
info!("Done");
|
||||
|
||||
@@ -1677,9 +1696,13 @@ impl Tenant {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
if !self.is_active() {
|
||||
return Err(CreateTimelineError::Other(anyhow::anyhow!(
|
||||
"Cannot create timelines on inactive tenant"
|
||||
)));
|
||||
if matches!(self.current_state(), TenantState::Stopping { .. }) {
|
||||
return Err(CreateTimelineError::ShuttingDown);
|
||||
} else {
|
||||
return Err(CreateTimelineError::Other(anyhow::anyhow!(
|
||||
"Cannot create timelines on inactive tenant"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let _gate = self
|
||||
@@ -3755,27 +3778,25 @@ async fn run_initdb(
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
// If the `select!` below doesn't finish the `wait_with_output`,
|
||||
// let the task get `wait()`ed for asynchronously by tokio.
|
||||
// This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
|
||||
// TODO: fix for this is non-trivial, see
|
||||
// https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
|
||||
//
|
||||
.kill_on_drop(true)
|
||||
.spawn()?;
|
||||
|
||||
tokio::select! {
|
||||
initdb_output = initdb_command.wait_with_output() => {
|
||||
let initdb_output = initdb_output?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_command.wait_with_output().await?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
initdb_output.status,
|
||||
initdb_output.stderr,
|
||||
));
|
||||
}
|
||||
|
||||
// This isn't true cancellation support, see above. Still return an error to
|
||||
// excercise the cancellation code path.
|
||||
if cancel.is_cancelled() {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -4031,7 +4052,7 @@ pub(crate) mod harness {
|
||||
.instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
tenant
|
||||
.attach(Some(preload), ctx)
|
||||
.attach(Some(preload), SpawnMode::Normal, ctx)
|
||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -5,10 +5,10 @@
|
||||
use super::ephemeral_file::EphemeralFile;
|
||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use bytes::Bytes;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::ops::Deref;
|
||||
|
||||
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
|
||||
/// blocks, using the page cache
|
||||
@@ -39,6 +39,8 @@ pub enum BlockLease<'a> {
|
||||
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
|
||||
#[cfg(test)]
|
||||
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
|
||||
#[cfg(test)]
|
||||
Vec(Vec<u8>),
|
||||
}
|
||||
|
||||
impl From<PageReadGuard<'static>> for BlockLease<'static> {
|
||||
@@ -63,6 +65,10 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
BlockLease::EphemeralFileMutableTail(v) => v,
|
||||
#[cfg(test)]
|
||||
BlockLease::Arc(v) => v.deref(),
|
||||
#[cfg(test)]
|
||||
BlockLease::Vec(v) => {
|
||||
TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -169,10 +175,14 @@ impl FileBlockReader {
|
||||
}
|
||||
|
||||
/// Read a page from the underlying file into given buffer.
|
||||
async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
|
||||
async fn fill_buffer(
|
||||
&self,
|
||||
buf: PageWriteGuard<'static>,
|
||||
blkno: u32,
|
||||
) -> Result<PageWriteGuard<'static>, std::io::Error> {
|
||||
assert!(buf.len() == PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
|
||||
.read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
|
||||
.await
|
||||
}
|
||||
/// Read a block.
|
||||
@@ -196,9 +206,9 @@ impl FileBlockReader {
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
ReadBufResult::NotFound(write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
let write_guard = self.fill_buffer(write_guard, blknum).await?;
|
||||
Ok(write_guard.mark_valid().into())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -409,7 +409,10 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant.attach(preload, ctx).await.context("attach")?;
|
||||
tenant
|
||||
.attach(preload, super::SpawnMode::Normal, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
|
||||
@@ -5,11 +5,11 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::cmp::min;
|
||||
use std::fs::OpenOptions;
|
||||
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
@@ -47,7 +47,10 @@ impl EphemeralFile {
|
||||
|
||||
let file = VirtualFile::open_with_options(
|
||||
&filename,
|
||||
OpenOptions::new().read(true).write(true).create(true),
|
||||
virtual_file::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -89,11 +92,10 @@ impl EphemeralFile {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = self
|
||||
.file
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
return Ok(BlockLease::PageReadGuard(read_guard));
|
||||
|
||||
@@ -283,15 +283,15 @@ impl LayerMap {
|
||||
///
|
||||
/// This is used for garbage collection, to determine if an old layer can
|
||||
/// be deleted.
|
||||
pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> Result<bool> {
|
||||
pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
|
||||
if key.is_empty() {
|
||||
// Vacuously true. There's a newer image for all 0 of the kerys in the range.
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
|
||||
Some(v) => v,
|
||||
None => return Ok(false),
|
||||
None => return false,
|
||||
};
|
||||
|
||||
let start = key.start.to_i128();
|
||||
@@ -304,17 +304,17 @@ impl LayerMap {
|
||||
|
||||
// Check the start is covered
|
||||
if !layer_covers(version.image_coverage.query(start)) {
|
||||
return Ok(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check after all changes of coverage
|
||||
for (_, change_val) in version.image_coverage.range(start..end) {
|
||||
if !layer_covers(change_val) {
|
||||
return Ok(false);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
true
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
|
||||
@@ -325,18 +325,14 @@ impl LayerMap {
|
||||
/// Divide the whole given range of keys into sub-ranges based on the latest
|
||||
/// image layer that covers each range at the specified lsn (inclusive).
|
||||
/// This is used when creating new image layers.
|
||||
///
|
||||
// FIXME: clippy complains that the result type is very complex. She's probably
|
||||
// right...
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub fn image_coverage(
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
|
||||
) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.0) {
|
||||
Some(v) => v,
|
||||
None => return Ok(vec![]),
|
||||
None => return vec![],
|
||||
};
|
||||
|
||||
let start = key_range.start.to_i128();
|
||||
@@ -359,7 +355,7 @@ impl LayerMap {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||
coverage.push((kr, current_val.take()));
|
||||
|
||||
Ok(coverage)
|
||||
coverage
|
||||
}
|
||||
|
||||
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
|
||||
@@ -410,24 +406,19 @@ impl LayerMap {
|
||||
/// This number is used to compute the largest number of deltas that
|
||||
/// we'll need to visit for any page reconstruction in this region.
|
||||
/// We use this heuristic to decide whether to create an image layer.
|
||||
pub fn count_deltas(
|
||||
&self,
|
||||
key: &Range<Key>,
|
||||
lsn: &Range<Lsn>,
|
||||
limit: Option<usize>,
|
||||
) -> Result<usize> {
|
||||
pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
|
||||
// We get the delta coverage of the region, and for each part of the coverage
|
||||
// we recurse right underneath the delta. The recursion depth is limited by
|
||||
// the largest result this function could return, which is in practice between
|
||||
// 3 and 10 (since we usually try to create an image when the number gets larger).
|
||||
|
||||
if lsn.is_empty() || key.is_empty() || limit == Some(0) {
|
||||
return Ok(0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
|
||||
Some(v) => v,
|
||||
None => return Ok(0),
|
||||
None => return 0,
|
||||
};
|
||||
|
||||
let start = key.start.to_i128();
|
||||
@@ -448,8 +439,7 @@ impl LayerMap {
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath =
|
||||
self.count_deltas(&kr, &lr, new_limit)?;
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
max_stacked_deltas,
|
||||
base_count + max_stacked_deltas_underneath,
|
||||
@@ -471,7 +461,7 @@ impl LayerMap {
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
max_stacked_deltas,
|
||||
base_count + max_stacked_deltas_underneath,
|
||||
@@ -480,7 +470,7 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(max_stacked_deltas)
|
||||
max_stacked_deltas
|
||||
}
|
||||
|
||||
/// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
|
||||
@@ -592,10 +582,7 @@ impl LayerMap {
|
||||
if limit == Some(difficulty) {
|
||||
break;
|
||||
}
|
||||
for (img_range, last_img) in self
|
||||
.image_coverage(range, lsn)
|
||||
.expect("why would this err?")
|
||||
{
|
||||
for (img_range, last_img) in self.image_coverage(range, lsn) {
|
||||
if limit == Some(difficulty) {
|
||||
break;
|
||||
}
|
||||
@@ -606,9 +593,7 @@ impl LayerMap {
|
||||
};
|
||||
|
||||
if img_lsn < lsn {
|
||||
let num_deltas = self
|
||||
.count_deltas(&img_range, &(img_lsn..lsn), limit)
|
||||
.expect("why would this err lol?");
|
||||
let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
|
||||
difficulty = std::cmp::max(difficulty, num_deltas);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
@@ -32,7 +33,8 @@ use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
TenantConfOpt,
|
||||
};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
@@ -466,6 +468,26 @@ pub async fn init_tenant_mgr(
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
if let Some(gen) = generations.get(&tenant_shard_id) {
|
||||
if let LocationMode::Attached(attached) = &location_conf.mode {
|
||||
if attached.generation > *gen {
|
||||
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
attached.generation
|
||||
);
|
||||
|
||||
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
|
||||
// local disk content: demote to secondary rather than detaching.
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf,
|
||||
&SecondaryLocationConfig { warm: false },
|
||||
)),
|
||||
);
|
||||
}
|
||||
}
|
||||
*gen
|
||||
} else {
|
||||
match &location_conf.mode {
|
||||
@@ -721,7 +743,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
tokio::select! {
|
||||
Some(joined) = join_set.join_next() => {
|
||||
match joined {
|
||||
Ok(()) => {}
|
||||
Ok(()) => {},
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the tasks");
|
||||
}
|
||||
@@ -882,7 +904,7 @@ impl TenantManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_location_config: LocationConf,
|
||||
flush: Option<Duration>,
|
||||
spawn_mode: SpawnMode,
|
||||
mut spawn_mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
@@ -902,19 +924,29 @@ impl TenantManager {
|
||||
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
|
||||
match (&new_location_config.mode, peek_slot) {
|
||||
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
|
||||
if attach_conf.generation == tenant.generation {
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
match attach_conf.generation.cmp(&tenant.generation) {
|
||||
Ordering::Equal => {
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
} else {
|
||||
// Different generations, fall through to general case
|
||||
None
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
}
|
||||
Ordering::Less => {
|
||||
return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
|
||||
"Generation {:?} is less than existing {:?}",
|
||||
attach_conf.generation,
|
||||
tenant.generation
|
||||
)));
|
||||
}
|
||||
Ordering::Greater => {
|
||||
// Generation advanced, fall through to general case of replacing `Tenant` object
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
(
|
||||
@@ -1019,6 +1051,12 @@ impl TenantManager {
|
||||
}
|
||||
}
|
||||
slot_guard.drop_old_value().expect("We just shut it down");
|
||||
|
||||
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
|
||||
// the caller thinks they're creating but the tenant already existed. We must switch to
|
||||
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
|
||||
// rather than assuming it to be empty.
|
||||
spawn_mode = SpawnMode::Normal;
|
||||
}
|
||||
Some(TenantSlot::Secondary(state)) => {
|
||||
info!("Shutting down secondary tenant");
|
||||
@@ -1102,14 +1140,46 @@ impl TenantManager {
|
||||
None
|
||||
};
|
||||
|
||||
slot_guard.upsert(new_slot).map_err(|e| match e {
|
||||
TenantSlotUpsertError::InternalError(e) => {
|
||||
UpsertLocationError::Other(anyhow::anyhow!(e))
|
||||
match slot_guard.upsert(new_slot) {
|
||||
Err(TenantSlotUpsertError::InternalError(e)) => {
|
||||
Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
|
||||
}
|
||||
TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
|
||||
})?;
|
||||
Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
|
||||
Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
|
||||
// If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
|
||||
// we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
|
||||
// are shutdown.
|
||||
//
|
||||
// We must shut it down inline here.
|
||||
match new_slot {
|
||||
TenantSlot::InProgress(_) => {
|
||||
// Unreachable because we never insert an InProgress
|
||||
unreachable!()
|
||||
}
|
||||
TenantSlot::Attached(tenant) => {
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {
|
||||
info!("Finished shutting down just-spawned tenant");
|
||||
}
|
||||
Err(barrier) => {
|
||||
info!("Shutdown already in progress, waiting for it to complete");
|
||||
barrier.wait().await;
|
||||
}
|
||||
}
|
||||
}
|
||||
TenantSlot::Secondary(secondary_tenant) => {
|
||||
secondary_tenant.shutdown().await;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(attached_tenant)
|
||||
Err(UpsertLocationError::Unavailable(
|
||||
TenantMapError::ShuttingDown,
|
||||
))
|
||||
}
|
||||
Ok(()) => Ok(attached_tenant),
|
||||
}
|
||||
}
|
||||
|
||||
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
|
||||
@@ -1728,14 +1798,31 @@ pub(crate) enum TenantSlotError {
|
||||
|
||||
/// Superset of TenantMapError: issues that can occur when using a SlotGuard
|
||||
/// to insert a new value.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum TenantSlotUpsertError {
|
||||
#[derive(thiserror::Error)]
|
||||
pub(crate) enum TenantSlotUpsertError {
|
||||
/// An error where the slot is in an unexpected state, indicating a code bug
|
||||
#[error("Internal error updating Tenant")]
|
||||
InternalError(Cow<'static, str>),
|
||||
|
||||
#[error(transparent)]
|
||||
MapState(#[from] TenantMapError),
|
||||
MapState(TenantMapError),
|
||||
|
||||
// If we encounter TenantManager shutdown during upsert, we must carry the Completion
|
||||
// from the SlotGuard, so that the caller can hold it while they clean up: otherwise
|
||||
// TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
|
||||
// was protected by the SlotGuard.
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown((TenantSlot, utils::completion::Completion)),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for TenantSlotUpsertError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
|
||||
Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
|
||||
Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1784,7 +1871,7 @@ pub struct SlotGuard {
|
||||
|
||||
/// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
|
||||
/// release any waiters as soon as this SlotGuard is dropped.
|
||||
_completion: utils::completion::Completion,
|
||||
completion: utils::completion::Completion,
|
||||
}
|
||||
|
||||
impl SlotGuard {
|
||||
@@ -1797,7 +1884,7 @@ impl SlotGuard {
|
||||
tenant_shard_id,
|
||||
old_value,
|
||||
upserted: false,
|
||||
_completion: completion,
|
||||
completion,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1830,9 +1917,16 @@ impl SlotGuard {
|
||||
}
|
||||
|
||||
let m = match &mut *locked {
|
||||
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
|
||||
TenantsMap::Initializing => {
|
||||
return Err(TenantSlotUpsertError::MapState(
|
||||
TenantMapError::StillInitializing,
|
||||
))
|
||||
}
|
||||
TenantsMap::ShuttingDown(_) => {
|
||||
return Err(TenantMapError::ShuttingDown.into());
|
||||
return Err(TenantSlotUpsertError::ShuttingDown((
|
||||
new_value,
|
||||
self.completion.clone(),
|
||||
)));
|
||||
}
|
||||
TenantsMap::Open(m) => m,
|
||||
};
|
||||
@@ -1880,7 +1974,9 @@ impl SlotGuard {
|
||||
Err(TenantSlotUpsertError::InternalError(_)) => {
|
||||
// We already logged the error, nothing else we can do.
|
||||
}
|
||||
Err(TenantSlotUpsertError::MapState(_)) => {
|
||||
Err(
|
||||
TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
|
||||
) => {
|
||||
// If the map is shutting down, we need not replace anything
|
||||
}
|
||||
Ok(()) => {}
|
||||
@@ -1978,18 +2074,22 @@ fn tenant_map_peek_slot<'a>(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
mode: TenantSlotPeekMode,
|
||||
) -> Result<Option<&'a TenantSlot>, TenantMapError> {
|
||||
let m = match tenants.deref() {
|
||||
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
|
||||
match tenants.deref() {
|
||||
TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
|
||||
TenantsMap::ShuttingDown(m) => match mode {
|
||||
TenantSlotPeekMode::Read => m,
|
||||
TenantSlotPeekMode::Write => {
|
||||
return Err(TenantMapError::ShuttingDown);
|
||||
}
|
||||
TenantSlotPeekMode::Read => Ok(Some(
|
||||
// When reading in ShuttingDown state, we must translate None results
|
||||
// into a ShuttingDown error, because absence of a tenant shard ID in the map
|
||||
// isn't a reliable indicator of the tenant being gone: it might have been
|
||||
// InProgress when shutdown started, and cleaned up from that state such
|
||||
// that it's now no longer in the map. Callers will have to wait until
|
||||
// we next start up to get a proper answer. This avoids incorrect 404 API responses.
|
||||
m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
|
||||
)),
|
||||
TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
|
||||
},
|
||||
TenantsMap::Open(m) => m,
|
||||
};
|
||||
|
||||
Ok(m.get(tenant_shard_id))
|
||||
TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
|
||||
}
|
||||
}
|
||||
|
||||
enum TenantSlotAcquireMode {
|
||||
|
||||
@@ -257,6 +257,8 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
|
||||
|
||||
pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
|
||||
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
@@ -1066,6 +1068,28 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
self: &Arc<Self>,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
backoff::retry(
|
||||
|| async {
|
||||
upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"preserve_initdb_tar_zst",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
|
||||
)
|
||||
.await
|
||||
.context("backing up initdb archive")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
|
||||
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
|
||||
/// deletes leaked files if any and proceeds with deletion of index file at the end.
|
||||
@@ -1101,6 +1125,14 @@ impl RemoteTimelineClient {
|
||||
let layer_deletion_count = layers.len();
|
||||
self.deletion_queue_client.push_immediate(layers).await?;
|
||||
|
||||
// Delete the initdb.tar.zst, which is not always present, but deletion attempts of
|
||||
// inexistant objects are not considered errors.
|
||||
let initdb_path =
|
||||
remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
|
||||
self.deletion_queue_client
|
||||
.push_immediate(vec![initdb_path])
|
||||
.await?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
||||
@@ -1148,10 +1180,8 @@ impl RemoteTimelineClient {
|
||||
if p == &latest_index {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = p.object_name() {
|
||||
if name == INITDB_PATH {
|
||||
return false;
|
||||
}
|
||||
if p.object_name() == Some(INITDB_PRESERVED_PATH) {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})
|
||||
@@ -1724,6 +1754,16 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_initdb_preserved_archive_path(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> RemotePath {
|
||||
RemotePath::from_string(&format!(
|
||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}"
|
||||
))
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_index_path(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
|
||||
@@ -32,7 +32,8 @@ use utils::id::TimelineId;
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{
|
||||
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
||||
remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
|
||||
INITDB_PATH,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -430,6 +431,9 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
|
||||
let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
let remote_preserved_path =
|
||||
remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
let timeline_path = conf.timelines_path(tenant_shard_id);
|
||||
|
||||
if !timeline_path.exists() {
|
||||
@@ -456,8 +460,16 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download =
|
||||
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
|
||||
let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
|
||||
.await
|
||||
{
|
||||
Ok(dl) => dl,
|
||||
Err(DownloadError::NotFound) => {
|
||||
download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
|
||||
.await?
|
||||
}
|
||||
Err(other) => Err(other)?,
|
||||
};
|
||||
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
|
||||
|
||||
|
||||
@@ -13,8 +13,8 @@ use super::Generation;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
|
||||
upload_cancellable,
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
|
||||
},
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -144,3 +144,16 @@ pub(crate) async fn upload_initdb_dir(
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
|
||||
.await
|
||||
.with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
@@ -649,7 +649,7 @@ impl DeltaLayer {
|
||||
{
|
||||
let file = VirtualFile::open_with_options(
|
||||
path,
|
||||
&*std::fs::OpenOptions::new().read(true).write(true),
|
||||
virtual_file::OpenOptions::new().read(true).write(true),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
|
||||
@@ -34,7 +34,7 @@ use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
@@ -327,7 +327,7 @@ impl ImageLayer {
|
||||
{
|
||||
let file = VirtualFile::open_with_options(
|
||||
path,
|
||||
&*std::fs::OpenOptions::new().read(true).write(true),
|
||||
virtual_file::OpenOptions::new().read(true).write(true),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
@@ -492,11 +492,15 @@ impl ImageLayerWriterInner {
|
||||
},
|
||||
);
|
||||
info!("new image layer {path}");
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
std::fs::OpenOptions::new().write(true).create_new(true),
|
||||
)
|
||||
.await?;
|
||||
let mut file = {
|
||||
VirtualFile::open_with_options(
|
||||
&path,
|
||||
virtual_file::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true),
|
||||
)
|
||||
.await?
|
||||
};
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
|
||||
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
wait_duration
|
||||
} else {
|
||||
@@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
}
|
||||
|
||||
fn log_compaction_error(
|
||||
e: &CompactionError,
|
||||
error_run_count: u32,
|
||||
sleep_duration: &std::time::Duration,
|
||||
task_cancelled: bool,
|
||||
) {
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use CompactionError::*;
|
||||
|
||||
enum LooksLike {
|
||||
Info,
|
||||
Error,
|
||||
}
|
||||
|
||||
let decision = match e {
|
||||
ShuttingDown => None,
|
||||
_ if task_cancelled => Some(LooksLike::Info),
|
||||
Other(e) => {
|
||||
let root_cause = e.root_cause();
|
||||
|
||||
let is_stopping = {
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
upload_queue || timeline
|
||||
};
|
||||
|
||||
if is_stopping {
|
||||
Some(LooksLike::Info)
|
||||
} else {
|
||||
Some(LooksLike::Error)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match decision {
|
||||
Some(LooksLike::Info) => info!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
|
||||
),
|
||||
Some(LooksLike::Error) => error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
|
||||
),
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// GC task's main loop
|
||||
///
|
||||
|
||||
@@ -14,6 +14,7 @@ use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
keyspace::{key_range_size, KeySpaceAccum},
|
||||
models::{
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
|
||||
LayerMapInfo, TimelineState,
|
||||
@@ -32,7 +33,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::sync::gate::Gate;
|
||||
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
@@ -73,8 +74,8 @@ use crate::metrics::{
|
||||
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
};
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
|
||||
@@ -391,8 +392,7 @@ pub(crate) enum PageReconstructError {
|
||||
#[error("Ancestor LSN wait error: {0}")]
|
||||
AncestorLsnTimeout(#[from] WaitLsnError),
|
||||
|
||||
/// The operation was cancelled
|
||||
#[error("Cancelled")]
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
/// The ancestor of this is being stopped
|
||||
@@ -404,6 +404,34 @@ pub(crate) enum PageReconstructError {
|
||||
WalRedo(anyhow::Error),
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
/// Returns true if this error indicates a tenant/timeline shutdown alike situation
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use PageReconstructError::*;
|
||||
match self {
|
||||
Other(_) => false,
|
||||
AncestorLsnTimeout(_) => false,
|
||||
Cancelled | AncestorStopping(_) => true,
|
||||
WalRedo(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum CreateImageLayersError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
GetVectoredError(GetVectoredError),
|
||||
|
||||
#[error(transparent)]
|
||||
PageReconstructError(PageReconstructError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum FlushLayerError {
|
||||
/// Timeline cancellation token was cancelled
|
||||
@@ -411,12 +439,24 @@ enum FlushLayerError {
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
PageReconstructError(#[from] PageReconstructError),
|
||||
CreateImageLayersError(CreateImageLayersError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetVectoredError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
|
||||
Oversized(u64),
|
||||
|
||||
#[error("Requested at invalid LSN: {0}")]
|
||||
InvalidLsn(Lsn),
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum LogicalSizeCalculationCause {
|
||||
Initial,
|
||||
@@ -456,6 +496,45 @@ pub(crate) enum WaitLsnError {
|
||||
Timeout(String),
|
||||
}
|
||||
|
||||
// The impls below achieve cancellation mapping for errors.
|
||||
// Perhaps there's a way of achieving this with less cruft.
|
||||
|
||||
impl From<CreateImageLayersError> for CompactionError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
|
||||
_ => CompactionError::Other(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CreateImageLayersError> for FlushLayerError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
|
||||
any => FlushLayerError::CreateImageLayersError(any),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for CreateImageLayersError {
|
||||
fn from(e: PageReconstructError) -> Self {
|
||||
match e {
|
||||
PageReconstructError::Cancelled => CreateImageLayersError::Cancelled,
|
||||
_ => CreateImageLayersError::PageReconstructError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetVectoredError> for CreateImageLayersError {
|
||||
fn from(e: GetVectoredError) -> Self {
|
||||
match e {
|
||||
GetVectoredError::Cancelled => CreateImageLayersError::Cancelled,
|
||||
_ => CreateImageLayersError::GetVectoredError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -575,6 +654,57 @@ impl Timeline {
|
||||
res
|
||||
}
|
||||
|
||||
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
|
||||
|
||||
/// Look up multiple page versions at a given LSN
|
||||
///
|
||||
/// This naive implementation will be replaced with a more efficient one
|
||||
/// which actually vectorizes the read path.
|
||||
pub(crate) async fn get_vectored(
|
||||
&self,
|
||||
key_ranges: &[Range<Key>],
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
if !lsn.is_valid() {
|
||||
return Err(GetVectoredError::InvalidLsn(lsn));
|
||||
}
|
||||
|
||||
let key_count = key_ranges
|
||||
.iter()
|
||||
.map(|range| key_range_size(range) as u64)
|
||||
.sum();
|
||||
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
}
|
||||
|
||||
let _timer = crate::metrics::GET_VECTORED_LATENCY
|
||||
.for_task_kind(ctx.task_kind())
|
||||
.map(|t| t.start_timer());
|
||||
|
||||
let mut values = BTreeMap::new();
|
||||
for range in key_ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
|
||||
let block = self.get(key, lsn, ctx).await;
|
||||
|
||||
if matches!(
|
||||
block,
|
||||
Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
||||
) {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
values.insert(key, block);
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(values)
|
||||
}
|
||||
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
pub fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load().last
|
||||
@@ -2582,7 +2712,7 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
err @ Err(
|
||||
FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
|
||||
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
|
||||
) => {
|
||||
error!("could not flush frozen layer: {err:?}");
|
||||
break err;
|
||||
@@ -2859,6 +2989,21 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client
|
||||
.preserve_initdb_archive(
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&self.timeline_id,
|
||||
&self.cancel,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
|
||||
// in layer map immediately. The caller is responsible to put it into the layer map.
|
||||
async fn create_delta_layer(
|
||||
@@ -2950,11 +3095,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
async fn time_for_new_image_layer(
|
||||
&self,
|
||||
partition: &KeySpace,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<bool> {
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
@@ -2974,20 +3115,20 @@ impl Timeline {
|
||||
// but the range is already covered by image layers at more recent LSNs. Before we
|
||||
// create a new image layer, check if the range is already covered at more recent LSNs.
|
||||
if !layers
|
||||
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
|
||||
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
|
||||
{
|
||||
debug!(
|
||||
"Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
|
||||
img_range.start, img_range.end, cutoff_lsn, lsn
|
||||
);
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for part_range in &partition.ranges {
|
||||
let image_coverage = layers.image_coverage(part_range, lsn)?;
|
||||
let image_coverage = layers.image_coverage(part_range, lsn);
|
||||
for (img_range, last_img) in image_coverage {
|
||||
let img_lsn = if let Some(last_img) = last_img {
|
||||
last_img.get_lsn_range().end
|
||||
@@ -3008,7 +3149,7 @@ impl Timeline {
|
||||
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||
if img_lsn < lsn {
|
||||
let num_deltas =
|
||||
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
|
||||
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold));
|
||||
|
||||
max_deltas = max_deltas.max(num_deltas);
|
||||
if num_deltas >= threshold {
|
||||
@@ -3016,7 +3157,7 @@ impl Timeline {
|
||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||
);
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3026,7 +3167,7 @@ impl Timeline {
|
||||
max_deltas,
|
||||
"none of the partitioned ranges had >= {threshold} deltas"
|
||||
);
|
||||
Ok(false)
|
||||
false
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%lsn, %force))]
|
||||
@@ -3036,7 +3177,7 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
force: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<ResidentLayer>, PageReconstructError> {
|
||||
) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
let mut image_layers = Vec::new();
|
||||
|
||||
@@ -3054,7 +3195,7 @@ impl Timeline {
|
||||
for partition in partitioning.parts.iter() {
|
||||
let img_range = start..partition.ranges.last().unwrap().end;
|
||||
start = img_range.end;
|
||||
if force || self.time_for_new_image_layer(partition, lsn).await? {
|
||||
if force || self.time_for_new_image_layer(partition, lsn).await {
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
@@ -3065,10 +3206,12 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
"failpoint image-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
|
||||
let mut key_request_accum = KeySpaceAccum::new();
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
@@ -3081,34 +3224,55 @@ impl Timeline {
|
||||
key = key.next();
|
||||
continue;
|
||||
}
|
||||
let img = match self.get(key, lsn, ctx).await {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
|
||||
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
image_layer_writer.put_image(key, &img).await?;
|
||||
key_request_accum.add_key(key);
|
||||
if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| key.next() == range.end
|
||||
{
|
||||
let results = self
|
||||
.get_vectored(
|
||||
&key_request_accum.consume_keyspace().ranges,
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for (img_key, img) in results {
|
||||
let img = match img {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(img_key)
|
||||
|| is_rel_vm_block_key(img_key)
|
||||
{
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(
|
||||
CreateImageLayersError::PageReconstructError(err),
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
image_layer_writer.put_image(img_key, &img).await?;
|
||||
}
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
@@ -3484,7 +3648,7 @@ impl Timeline {
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
|
||||
// That is why it is better to measure size of hole as number of covering image layers.
|
||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
|
||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
|
||||
if coverage_size >= min_hole_coverage_size {
|
||||
heap.push(Hole {
|
||||
key_range,
|
||||
@@ -4110,7 +4274,7 @@ impl Timeline {
|
||||
// we cannot remove C, even though it's older than 2500, because
|
||||
// the delta layer 2000-3000 depends on it.
|
||||
if !layers
|
||||
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
|
||||
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
|
||||
{
|
||||
debug!("keeping {} because it is the latest layer", l.filename());
|
||||
// Collect delta key ranges that need image layers to allow garbage
|
||||
@@ -4240,7 +4404,7 @@ impl Timeline {
|
||||
.walredo_mgr
|
||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||
.await
|
||||
.context("Failed to reconstruct a page image:")
|
||||
.context("reconstruct a page image")
|
||||
{
|
||||
Ok(img) => img,
|
||||
Err(e) => return Err(PageReconstructError::WalRedo(e)),
|
||||
|
||||
@@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped {
|
||||
pub(super) deleted_at: SetDeletedFlagProgress,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum NotInitialized {
|
||||
#[error("queue is in state Uninitialized")]
|
||||
Uninitialized,
|
||||
#[error("queue is in state Stopping")]
|
||||
Stopped,
|
||||
#[error("queue is shutting down")]
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
impl NotInitialized {
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use NotInitialized::*;
|
||||
match self {
|
||||
Uninitialized => false,
|
||||
Stopped => true,
|
||||
ShuttingDown => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UploadQueue {
|
||||
pub(crate) fn initialize_empty_remote(
|
||||
&mut self,
|
||||
@@ -214,17 +235,17 @@ impl UploadQueue {
|
||||
}
|
||||
|
||||
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
use UploadQueue::*;
|
||||
match self {
|
||||
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
|
||||
anyhow::bail!("queue is in state {}", self.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(x) => {
|
||||
if !x.shutting_down {
|
||||
Ok(x)
|
||||
Uninitialized => Err(NotInitialized::Uninitialized.into()),
|
||||
Initialized(x) => {
|
||||
if x.shutting_down {
|
||||
Err(NotInitialized::ShuttingDown.into())
|
||||
} else {
|
||||
anyhow::bail!("queue is shutting down")
|
||||
Ok(x)
|
||||
}
|
||||
}
|
||||
Stopped(_) => Err(NotInitialized::Stopped.into()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,18 +11,28 @@
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
|
||||
|
||||
use crate::page_cache::PageWriteGuard;
|
||||
use crate::tenant::TENANTS_SEGMENT_NAME;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::fs::{self, File};
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
|
||||
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tokio::time::Instant;
|
||||
use utils::fs_ext;
|
||||
|
||||
mod io_engine;
|
||||
mod open_options;
|
||||
pub use io_engine::IoEngineKind;
|
||||
pub(crate) use open_options::*;
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
/// the underlying file is closed if the system is low on file descriptors,
|
||||
@@ -106,7 +116,38 @@ struct SlotInner {
|
||||
tag: u64,
|
||||
|
||||
/// the underlying file
|
||||
file: Option<File>,
|
||||
file: Option<OwnedFd>,
|
||||
}
|
||||
|
||||
/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
|
||||
struct PageWriteGuardBuf {
|
||||
page: PageWriteGuard<'static>,
|
||||
init_up_to: usize,
|
||||
}
|
||||
// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
|
||||
// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
|
||||
unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
|
||||
fn stable_ptr(&self) -> *const u8 {
|
||||
self.page.as_ptr()
|
||||
}
|
||||
fn bytes_init(&self) -> usize {
|
||||
self.init_up_to
|
||||
}
|
||||
fn bytes_total(&self) -> usize {
|
||||
self.page.len()
|
||||
}
|
||||
}
|
||||
// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access,
|
||||
// hence it's safe to hand out the `stable_mut_ptr()`.
|
||||
unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
|
||||
fn stable_mut_ptr(&mut self) -> *mut u8 {
|
||||
self.page.as_mut_ptr()
|
||||
}
|
||||
|
||||
unsafe fn set_init(&mut self, pos: usize) {
|
||||
assert!(pos <= self.page.len());
|
||||
self.init_up_to = pos;
|
||||
}
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
@@ -274,6 +315,10 @@ macro_rules! with_file {
|
||||
let $ident = $this.lock_file().await?;
|
||||
observe_duration!($op, $($body)*)
|
||||
}};
|
||||
($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{
|
||||
let mut $ident = $this.lock_file().await?;
|
||||
observe_duration!($op, $($body)*)
|
||||
}};
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
@@ -326,7 +371,9 @@ impl VirtualFile {
|
||||
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
|
||||
// where our caller doesn't get to use the returned VirtualFile before its
|
||||
// slot gets re-used by someone else.
|
||||
let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
|
||||
let file = observe_duration!(StorageIoOperation::Open, {
|
||||
open_options.open(path.as_std_path()).await?
|
||||
});
|
||||
|
||||
// Strip all options other than read and write.
|
||||
//
|
||||
@@ -400,15 +447,13 @@ impl VirtualFile {
|
||||
|
||||
/// Call File::sync_all() on the underlying File.
|
||||
pub async fn sync_all(&self) -> Result<(), Error> {
|
||||
with_file!(self, StorageIoOperation::Fsync, |file| file
|
||||
.as_ref()
|
||||
.sync_all())
|
||||
with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard
|
||||
.with_std_file(|std_file| std_file.sync_all()))
|
||||
}
|
||||
|
||||
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
|
||||
with_file!(self, StorageIoOperation::Metadata, |file| file
|
||||
.as_ref()
|
||||
.metadata())
|
||||
with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard
|
||||
.with_std_file(|std_file| std_file.metadata()))
|
||||
}
|
||||
|
||||
/// Helper function internal to `VirtualFile` that looks up the underlying File,
|
||||
@@ -417,7 +462,7 @@ impl VirtualFile {
|
||||
///
|
||||
/// We are doing it via a macro as Rust doesn't support async closures that
|
||||
/// take on parameters with lifetimes.
|
||||
async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
|
||||
async fn lock_file(&self) -> Result<FileGuard, Error> {
|
||||
let open_files = get_open_files();
|
||||
|
||||
let mut handle_guard = {
|
||||
@@ -463,10 +508,9 @@ impl VirtualFile {
|
||||
// NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
|
||||
// case from StorageIoOperation::Open. This helps with identifying thrashing
|
||||
// of the virtual file descriptor cache.
|
||||
let file = observe_duration!(
|
||||
StorageIoOperation::OpenAfterReplace,
|
||||
self.open_options.open(&self.path)
|
||||
)?;
|
||||
let file = observe_duration!(StorageIoOperation::OpenAfterReplace, {
|
||||
self.open_options.open(self.path.as_std_path()).await?
|
||||
});
|
||||
|
||||
// Store the File in the slot and update the handle in the VirtualFile
|
||||
// to point to it.
|
||||
@@ -491,9 +535,8 @@ impl VirtualFile {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(offset) => {
|
||||
self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
|
||||
.as_ref()
|
||||
.seek(SeekFrom::End(offset)))?
|
||||
self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard
|
||||
.with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))?
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
@@ -512,25 +555,28 @@ impl VirtualFile {
|
||||
Ok(self.pos)
|
||||
}
|
||||
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
|
||||
pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
|
||||
while !buf.is_empty() {
|
||||
match self.read_at(buf, offset).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
"failed to fill whole buffer",
|
||||
))
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &mut buf[n..];
|
||||
offset += n as u64;
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
|
||||
where
|
||||
B: IoBufMut + Send,
|
||||
{
|
||||
let (buf, res) =
|
||||
read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
|
||||
res.map(|()| buf)
|
||||
}
|
||||
|
||||
/// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
|
||||
pub async fn read_exact_at_page(
|
||||
&self,
|
||||
page: PageWriteGuard<'static>,
|
||||
offset: u64,
|
||||
) -> Result<PageWriteGuard<'static>, Error> {
|
||||
let buf = PageWriteGuardBuf {
|
||||
page,
|
||||
init_up_to: 0,
|
||||
};
|
||||
let res = self.read_exact_at(buf, offset).await;
|
||||
res.map(|PageWriteGuardBuf { page, .. }| page)
|
||||
.map_err(|e| Error::new(ErrorKind::Other, e))
|
||||
}
|
||||
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
|
||||
@@ -580,22 +626,35 @@ impl VirtualFile {
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = with_file!(self, StorageIoOperation::Read, |file| file
|
||||
.as_ref()
|
||||
.read_at(buf, offset));
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
|
||||
.add(size as i64);
|
||||
}
|
||||
result
|
||||
pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
|
||||
where
|
||||
B: tokio_epoll_uring::BoundedBufMut + Send,
|
||||
{
|
||||
let file_guard = match self.lock_file().await {
|
||||
Ok(file_guard) => file_guard,
|
||||
Err(e) => return (buf, Err(e)),
|
||||
};
|
||||
|
||||
observe_duration!(StorageIoOperation::Read, {
|
||||
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
|
||||
if let Ok(size) = res {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&[
|
||||
"read",
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
])
|
||||
.add(size as i64);
|
||||
}
|
||||
(buf, res)
|
||||
})
|
||||
}
|
||||
|
||||
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = with_file!(self, StorageIoOperation::Write, |file| file
|
||||
.as_ref()
|
||||
.write_at(buf, offset));
|
||||
let result = with_file!(self, StorageIoOperation::Write, |file_guard| {
|
||||
file_guard.with_std_file(|std_file| std_file.write_at(buf, offset))
|
||||
});
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
|
||||
@@ -605,18 +664,241 @@ impl VirtualFile {
|
||||
}
|
||||
}
|
||||
|
||||
struct FileGuard<'a> {
|
||||
slot_guard: RwLockReadGuard<'a, SlotInner>,
|
||||
// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
|
||||
pub async fn read_exact_at_impl<B, F, Fut>(
|
||||
buf: B,
|
||||
mut offset: u64,
|
||||
mut read_at: F,
|
||||
) -> (B, std::io::Result<()>)
|
||||
where
|
||||
B: IoBufMut + Send,
|
||||
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
|
||||
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
|
||||
{
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
|
||||
while buf.bytes_total() != 0 {
|
||||
let res;
|
||||
(buf, res) = read_at(buf, offset).await;
|
||||
match res {
|
||||
Ok(0) => break,
|
||||
Ok(n) => {
|
||||
buf = buf.slice(n..);
|
||||
offset += n as u64;
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return (buf.into_inner(), Err(e)),
|
||||
}
|
||||
}
|
||||
// NB: don't use `buf.is_empty()` here; it is from the
|
||||
// `impl Deref for Slice { Target = [u8] }`; the the &[u8]
|
||||
// returned by it only covers the initialized portion of `buf`.
|
||||
// Whereas we're interested in ensuring that we filled the entire
|
||||
// buffer that the user passed in.
|
||||
if buf.bytes_total() != 0 {
|
||||
(
|
||||
buf.into_inner(),
|
||||
Err(std::io::Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
"failed to fill whole buffer",
|
||||
)),
|
||||
)
|
||||
} else {
|
||||
assert_eq!(buf.len(), buf.bytes_total());
|
||||
(buf.into_inner(), Ok(()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> AsRef<File> for FileGuard<'a> {
|
||||
fn as_ref(&self) -> &File {
|
||||
#[cfg(test)]
|
||||
mod test_read_exact_at_impl {
|
||||
|
||||
use std::{collections::VecDeque, sync::Arc};
|
||||
|
||||
use tokio_epoll_uring::{BoundedBuf, BoundedBufMut};
|
||||
|
||||
use super::read_exact_at_impl;
|
||||
|
||||
struct Expectation {
|
||||
offset: u64,
|
||||
bytes_total: usize,
|
||||
result: std::io::Result<Vec<u8>>,
|
||||
}
|
||||
struct MockReadAt {
|
||||
expectations: VecDeque<Expectation>,
|
||||
}
|
||||
|
||||
impl MockReadAt {
|
||||
async fn read_at(
|
||||
&mut self,
|
||||
mut buf: tokio_epoll_uring::Slice<Vec<u8>>,
|
||||
offset: u64,
|
||||
) -> (tokio_epoll_uring::Slice<Vec<u8>>, std::io::Result<usize>) {
|
||||
let exp = self
|
||||
.expectations
|
||||
.pop_front()
|
||||
.expect("read_at called but we have no expectations left");
|
||||
assert_eq!(exp.offset, offset);
|
||||
assert_eq!(exp.bytes_total, buf.bytes_total());
|
||||
match exp.result {
|
||||
Ok(bytes) => {
|
||||
assert!(bytes.len() <= buf.bytes_total());
|
||||
buf.put_slice(&bytes);
|
||||
(buf, Ok(bytes.len()))
|
||||
}
|
||||
Err(e) => (buf, Err(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for MockReadAt {
|
||||
fn drop(&mut self) {
|
||||
assert_eq!(self.expectations.len(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_basic() {
|
||||
let buf = Vec::with_capacity(5);
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::from(vec![Expectation {
|
||||
offset: 0,
|
||||
bytes_total: 5,
|
||||
result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
|
||||
}]),
|
||||
}));
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
.await;
|
||||
assert!(res.is_ok());
|
||||
assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_empty_buf_issues_no_syscall() {
|
||||
let buf = Vec::new();
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::new(),
|
||||
}));
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
.await;
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_two_read_at_calls_needed_until_buf_filled() {
|
||||
let buf = Vec::with_capacity(4);
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::from(vec![
|
||||
Expectation {
|
||||
offset: 0,
|
||||
bytes_total: 4,
|
||||
result: Ok(vec![b'a', b'b']),
|
||||
},
|
||||
Expectation {
|
||||
offset: 2,
|
||||
bytes_total: 2,
|
||||
result: Ok(vec![b'c', b'd']),
|
||||
},
|
||||
]),
|
||||
}));
|
||||
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
.await;
|
||||
assert!(res.is_ok());
|
||||
assert_eq!(buf, vec![b'a', b'b', b'c', b'd']);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_eof_before_buffer_full() {
|
||||
let buf = Vec::with_capacity(3);
|
||||
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
|
||||
expectations: VecDeque::from(vec![
|
||||
Expectation {
|
||||
offset: 0,
|
||||
bytes_total: 3,
|
||||
result: Ok(vec![b'a']),
|
||||
},
|
||||
Expectation {
|
||||
offset: 1,
|
||||
bytes_total: 2,
|
||||
result: Ok(vec![b'b']),
|
||||
},
|
||||
Expectation {
|
||||
offset: 2,
|
||||
bytes_total: 1,
|
||||
result: Ok(vec![]),
|
||||
},
|
||||
]),
|
||||
}));
|
||||
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
|
||||
let mock_read_at = Arc::clone(&mock_read_at);
|
||||
async move { mock_read_at.lock().await.read_at(buf, offset).await }
|
||||
})
|
||||
.await;
|
||||
let Err(err) = res else {
|
||||
panic!("should return an error");
|
||||
};
|
||||
assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof);
|
||||
assert_eq!(format!("{err}"), "failed to fill whole buffer");
|
||||
// buffer contents on error are unspecified
|
||||
}
|
||||
}
|
||||
|
||||
struct FileGuard {
|
||||
slot_guard: RwLockReadGuard<'static, SlotInner>,
|
||||
}
|
||||
|
||||
impl AsRef<OwnedFd> for FileGuard {
|
||||
fn as_ref(&self) -> &OwnedFd {
|
||||
// This unwrap is safe because we only create `FileGuard`s
|
||||
// if we know that the file is Some.
|
||||
self.slot_guard.file.as_ref().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl FileGuard {
|
||||
/// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
|
||||
fn with_std_file<F, R>(&self, with: F) -> R
|
||||
where
|
||||
F: FnOnce(&File) -> R,
|
||||
{
|
||||
// SAFETY:
|
||||
// - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
|
||||
// - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut`
|
||||
let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
|
||||
let res = with(&file);
|
||||
let _ = file.into_raw_fd();
|
||||
res
|
||||
}
|
||||
/// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
|
||||
fn with_std_file_mut<F, R>(&mut self, with: F) -> R
|
||||
where
|
||||
F: FnOnce(&mut File) -> R,
|
||||
{
|
||||
// SAFETY:
|
||||
// - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
|
||||
// - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd
|
||||
let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
|
||||
let res = with(&mut file);
|
||||
let _ = file.into_raw_fd();
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
impl tokio_epoll_uring::IoFd for FileGuard {
|
||||
unsafe fn as_fd(&self) -> RawFd {
|
||||
let owned_fd: &OwnedFd = self.as_ref();
|
||||
owned_fd.as_raw_fd()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl VirtualFile {
|
||||
pub(crate) async fn read_blk(
|
||||
@@ -624,16 +906,19 @@ impl VirtualFile {
|
||||
blknum: u32,
|
||||
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
let mut buf = [0; PAGE_SZ];
|
||||
self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
|
||||
let buf = vec![0; PAGE_SZ];
|
||||
let buf = self
|
||||
.read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
|
||||
.await?;
|
||||
Ok(std::sync::Arc::new(buf).into())
|
||||
Ok(crate::tenant::block_io::BlockLease::Vec(buf))
|
||||
}
|
||||
|
||||
async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
|
||||
let mut tmp = vec![0; 128];
|
||||
loop {
|
||||
let mut tmp = [0; 128];
|
||||
match self.read_at(&mut tmp, self.pos).await {
|
||||
let res;
|
||||
(tmp, res) = self.read_at(tmp, self.pos).await;
|
||||
match res {
|
||||
Ok(0) => return Ok(()),
|
||||
Ok(n) => {
|
||||
self.pos += n as u64;
|
||||
@@ -709,10 +994,12 @@ impl OpenFiles {
|
||||
/// Initialize the virtual file module. This must be called once at page
|
||||
/// server startup.
|
||||
///
|
||||
pub fn init(num_slots: usize) {
|
||||
#[cfg(not(test))]
|
||||
pub fn init(num_slots: usize, engine: IoEngineKind) {
|
||||
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
|
||||
panic!("virtual_file::init called twice");
|
||||
}
|
||||
io_engine::init(engine);
|
||||
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
|
||||
}
|
||||
|
||||
@@ -757,10 +1044,10 @@ mod tests {
|
||||
}
|
||||
|
||||
impl MaybeVirtualFile {
|
||||
async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
|
||||
async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
|
||||
MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
|
||||
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
|
||||
}
|
||||
}
|
||||
async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
|
||||
@@ -802,14 +1089,14 @@ mod tests {
|
||||
|
||||
// Helper function to slurp a portion of a file into a string
|
||||
async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
|
||||
let mut buf = vec![0; len];
|
||||
self.read_exact_at(&mut buf, pos).await?;
|
||||
let buf = vec![0; len];
|
||||
let buf = self.read_exact_at(buf, pos).await?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_virtual_files() -> Result<(), Error> {
|
||||
async fn test_virtual_files() -> anyhow::Result<()> {
|
||||
// The real work is done in the test_files() helper function. This
|
||||
// allows us to run the same set of tests against a native File, and
|
||||
// VirtualFile. We trust the native Files and wouldn't need to test them,
|
||||
@@ -825,14 +1112,17 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_physical_files() -> Result<(), Error> {
|
||||
async fn test_physical_files() -> anyhow::Result<()> {
|
||||
test_files("physical_files", |path, open_options| async move {
|
||||
Ok(MaybeVirtualFile::File(open_options.open(path)?))
|
||||
Ok(MaybeVirtualFile::File({
|
||||
let owned_fd = open_options.open(path.as_std_path()).await?;
|
||||
File::from(owned_fd)
|
||||
}))
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
|
||||
where
|
||||
OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
|
||||
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
|
||||
@@ -976,11 +1266,11 @@ mod tests {
|
||||
for _threadno in 0..THREADS {
|
||||
let files = files.clone();
|
||||
let hdl = rt.spawn(async move {
|
||||
let mut buf = [0u8; SIZE];
|
||||
let mut buf = vec![0u8; SIZE];
|
||||
let mut rng = rand::rngs::OsRng;
|
||||
for _ in 1..1000 {
|
||||
let f = &files[rng.gen_range(0..files.len())];
|
||||
f.read_exact_at(&mut buf, 0).await.unwrap();
|
||||
buf = f.read_exact_at(buf, 0).await.unwrap();
|
||||
assert!(buf == SAMPLE);
|
||||
}
|
||||
});
|
||||
|
||||
114
pageserver/src/virtual_file/io_engine.rs
Normal file
114
pageserver/src/virtual_file/io_engine.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
//! [`super::VirtualFile`] supports different IO engines.
|
||||
//!
|
||||
//! The [`IoEngineKind`] enum identifies them.
|
||||
//!
|
||||
//! The choice of IO engine is global.
|
||||
//! Initialize using [`init`].
|
||||
//!
|
||||
//! Then use [`get`] and [`super::OpenOptions`].
|
||||
|
||||
#[derive(
|
||||
Copy,
|
||||
Clone,
|
||||
PartialEq,
|
||||
Eq,
|
||||
Hash,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
Debug,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum IoEngineKind {
|
||||
StdFs,
|
||||
#[cfg(target_os = "linux")]
|
||||
TokioEpollUring,
|
||||
}
|
||||
|
||||
static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(super) fn init(engine: IoEngineKind) {
|
||||
if IO_ENGINE.set(engine).is_err() {
|
||||
panic!("called twice");
|
||||
}
|
||||
crate::metrics::virtual_file_io_engine::KIND
|
||||
.with_label_values(&[&format!("{engine}")])
|
||||
.set(1);
|
||||
}
|
||||
|
||||
pub(super) fn get() -> &'static IoEngineKind {
|
||||
#[cfg(test)]
|
||||
{
|
||||
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
|
||||
IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
|
||||
Ok(v) => match v.parse::<IoEngineKind>() {
|
||||
Ok(engine_kind) => engine_kind,
|
||||
Err(e) => {
|
||||
panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
|
||||
}
|
||||
},
|
||||
Err(std::env::VarError::NotPresent) => {
|
||||
crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
Err(std::env::VarError::NotUnicode(_)) => {
|
||||
panic!("env var {env_var_name} is not unicode");
|
||||
}
|
||||
})
|
||||
}
|
||||
#[cfg(not(test))]
|
||||
IO_ENGINE.get().unwrap()
|
||||
}
|
||||
|
||||
use std::os::unix::prelude::FileExt;
|
||||
|
||||
use super::FileGuard;
|
||||
|
||||
impl IoEngineKind {
|
||||
pub(super) async fn read_at<B>(
|
||||
&self,
|
||||
file_guard: FileGuard,
|
||||
offset: u64,
|
||||
mut buf: B,
|
||||
) -> ((FileGuard, B), std::io::Result<usize>)
|
||||
where
|
||||
B: tokio_epoll_uring::BoundedBufMut + Send,
|
||||
{
|
||||
match self {
|
||||
IoEngineKind::StdFs => {
|
||||
// SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
|
||||
let dst = unsafe {
|
||||
std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
|
||||
};
|
||||
let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset));
|
||||
if let Ok(nbytes) = &res {
|
||||
assert!(*nbytes <= buf.bytes_total());
|
||||
// SAFETY: see above assertion
|
||||
unsafe {
|
||||
buf.set_init(*nbytes);
|
||||
}
|
||||
}
|
||||
#[allow(dropping_references)]
|
||||
drop(dst);
|
||||
((file_guard, buf), res)
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngineKind::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let (resources, res) = system.read(file_guard, offset, buf).await;
|
||||
(
|
||||
resources,
|
||||
res.map_err(|e| match e {
|
||||
tokio_epoll_uring::Error::Op(e) => e,
|
||||
tokio_epoll_uring::Error::System(system) => {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, system)
|
||||
}
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
138
pageserver/src/virtual_file/open_options.rs
Normal file
138
pageserver/src/virtual_file/open_options.rs
Normal file
@@ -0,0 +1,138 @@
|
||||
//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];
|
||||
|
||||
use super::IoEngineKind;
|
||||
use std::{os::fd::OwnedFd, path::Path};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum OpenOptions {
|
||||
StdFs(std::fs::OpenOptions),
|
||||
#[cfg(target_os = "linux")]
|
||||
TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions),
|
||||
}
|
||||
|
||||
impl Default for OpenOptions {
|
||||
fn default() -> Self {
|
||||
match super::io_engine::get() {
|
||||
IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngineKind::TokioEpollUring => {
|
||||
Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OpenOptions {
|
||||
pub fn new() -> OpenOptions {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
pub fn read(&mut self, read: bool) -> &mut OpenOptions {
|
||||
match self {
|
||||
OpenOptions::StdFs(x) => {
|
||||
let _ = x.read(read);
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let _ = x.read(read);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
pub fn write(&mut self, write: bool) -> &mut OpenOptions {
|
||||
match self {
|
||||
OpenOptions::StdFs(x) => {
|
||||
let _ = x.write(write);
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let _ = x.write(write);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
pub fn create(&mut self, create: bool) -> &mut OpenOptions {
|
||||
match self {
|
||||
OpenOptions::StdFs(x) => {
|
||||
let _ = x.create(create);
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let _ = x.create(create);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions {
|
||||
match self {
|
||||
OpenOptions::StdFs(x) => {
|
||||
let _ = x.create_new(create_new);
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let _ = x.create_new(create_new);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions {
|
||||
match self {
|
||||
OpenOptions::StdFs(x) => {
|
||||
let _ = x.truncate(truncate);
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let _ = x.truncate(truncate);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> {
|
||||
match self {
|
||||
OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()),
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
system.open(path, x).await.map_err(|e| match e {
|
||||
tokio_epoll_uring::Error::Op(e) => e,
|
||||
tokio_epoll_uring::Error::System(system) => {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, system)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
|
||||
fn mode(&mut self, mode: u32) -> &mut OpenOptions {
|
||||
match self {
|
||||
OpenOptions::StdFs(x) => {
|
||||
let _ = x.mode(mode);
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let _ = x.mode(mode);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions {
|
||||
match self {
|
||||
OpenOptions::StdFs(x) => {
|
||||
let _ = x.custom_flags(flags);
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let _ = x.custom_flags(flags);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
}
|
||||
@@ -33,11 +33,12 @@ use utils::failpoint_support;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::pgdatadir_mapping::{DatadirModification, Version};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walrecord::*;
|
||||
use crate::ZERO_PAGE;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
@@ -102,7 +103,9 @@ impl WalIngest {
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
assert!(!self.checkpoint_modified);
|
||||
if self.checkpoint.update_next_xid(decoded.xl_xid) {
|
||||
if decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID
|
||||
&& self.checkpoint.update_next_xid(decoded.xl_xid)
|
||||
{
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
|
||||
@@ -330,8 +333,13 @@ impl WalIngest {
|
||||
< 0
|
||||
{
|
||||
self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
|
||||
// Write a new checkpoint key-value pair on every checkpoint record, even
|
||||
// if nothing really changed. Not strictly required, but it seems nice to
|
||||
// have some trace of the checkpoint records in the layer files at the same
|
||||
// LSNs.
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
pg_constants::RM_LOGICALMSG_ID => {
|
||||
@@ -1025,7 +1033,23 @@ impl WalIngest {
|
||||
// Copy content
|
||||
debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
|
||||
for blknum in 0..nblocks {
|
||||
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
|
||||
// Sharding:
|
||||
// - src and dst are always on the same shard, because they differ only by dbNode, and
|
||||
// dbNode is not included in the hash inputs for sharding.
|
||||
// - This WAL command is replayed on all shards, but each shard only copies the blocks
|
||||
// that belong to it.
|
||||
let src_key = rel_block_to_key(src_rel, blknum);
|
||||
if !self.shard.is_key_local(&src_key) {
|
||||
debug!(
|
||||
"Skipping non-local key {} during XLOG_DBASE_CREATE",
|
||||
src_key
|
||||
);
|
||||
continue;
|
||||
}
|
||||
debug!(
|
||||
"copying block {} from {} ({}) to {}",
|
||||
blknum, src_rel, src_key, dst_rel
|
||||
);
|
||||
|
||||
let content = modification
|
||||
.tline
|
||||
|
||||
@@ -47,11 +47,10 @@ use crate::metrics::{
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
|
||||
WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
|
||||
};
|
||||
use crate::pgdatadir_mapping::key_to_slru_block;
|
||||
use crate::repository::Key;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
|
||||
use pageserver_api::key::key_to_rel_block;
|
||||
use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
@@ -837,9 +836,8 @@ impl WalRedoProcess {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
|
||||
let mut nwrite = 0usize;
|
||||
|
||||
let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)];
|
||||
|
||||
while nwrite < writebuf.len() {
|
||||
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
@@ -878,7 +876,6 @@ impl WalRedoProcess {
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)];
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
@@ -886,6 +883,7 @@ impl WalRedoProcess {
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
while nresult < BLCKSZ.into() {
|
||||
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
|
||||
@@ -637,7 +637,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
ListCell *option;
|
||||
const char *role_name = stmt->role->rolename;
|
||||
|
||||
if (RoleIsNeonSuperuser(role_name))
|
||||
if (RoleIsNeonSuperuser(role_name) && !superuser())
|
||||
elog(ERROR, "can't ALTER neon_superuser");
|
||||
|
||||
foreach(option, stmt->options)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user