Compare commits

..

7 Commits

Author SHA1 Message Date
Conrad Ludgate
4283e4c19e use smallvec and add comment about memory usage 2024-01-23 09:34:34 +00:00
Conrad Ludgate
565d2b2494 fix ip tests 2024-01-22 16:49:35 +00:00
Conrad Ludgate
d5db782f26 fix mock 2024-01-22 15:17:06 +00:00
Conrad Ludgate
2f0e3428bb no error string 2024-01-22 15:00:45 +00:00
Conrad Ludgate
3928d3bc8a eager parsing of ip addr 2024-01-22 14:53:50 +00:00
Conrad Ludgate
e7d36cc41f fix tests 2024-01-22 14:36:36 +00:00
Conrad Ludgate
5e150c9376 switch to lru 2024-01-22 14:28:50 +00:00
203 changed files with 3371 additions and 10923 deletions

View File

@@ -1,28 +1,27 @@
*
# Files
!Cargo.lock
!Cargo.toml
!Makefile
!rust-toolchain.toml
!scripts/combine_control_files.py
!scripts/ninstall.sh
!vm-cgconfig.conf
!Cargo.toml
!Cargo.lock
!Makefile
# Directories
!.cargo/
!.config/
!compute_tools/
!control_plane/
!compute_tools/
!libs/
!neon_local/
!pageserver/
!patches/
!pgxn/
!proxy/
!s3_scrubber/
!safekeeper/
!s3_scrubber/
!storage_broker/
!trace/
!vendor/postgres-*/
!vendor/postgres-v14/
!vendor/postgres-v15/
!vendor/postgres-v16/
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf

View File

@@ -179,6 +179,23 @@ runs:
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi
- name: Store Allure test stat in the DB
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
run: |
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
./scripts/pysync
poetry run python3 scripts/ingest_regress_test_result.py \
--revision ${COMMIT_SHA} \
--reference ${GITHUB_REF} \
--build-type unified \
--ingest ${WORKDIR}/report/data/suites.json
- name: Store Allure test stat in the DB (new)
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}

View File

@@ -69,15 +69,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -93,15 +85,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -115,10 +99,7 @@ jobs:
steps:
- name: Create manifest
run: |
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}

View File

@@ -21,8 +21,6 @@ env:
COPT: '-Werror'
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
jobs:
check-permissions:
@@ -46,20 +44,6 @@ jobs:
exit 1
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- name: Cancel previous e2e-tests runs for this PR
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
gh workflow --repo neondatabase/cloud \
run cancel-previous-in-concurrency-group.yml \
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
@@ -202,11 +186,7 @@ jobs:
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
options: --init
strategy:
fail-fast: false
matrix:
@@ -360,12 +340,8 @@ jobs:
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run rust tests
env:
NEXTEST_RETRIES: 3
run: |
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -443,8 +419,8 @@ jobs:
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
# Default shared memory is 64mb
options: --init --shm-size=512mb
strategy:
fail-fast: false
matrix:
@@ -472,7 +448,6 @@ jobs:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
- name: Merge and upload coverage data
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -483,13 +458,12 @@ jobs:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
# Default shared memory is 64mb
options: --init --shm-size=512mb
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
fail-fast: false
matrix:
# the amount of groups (N) should be reflected in `extra_params: --splits N ...`
pytest_split_group: [ 1, 2, 3, 4 ]
build_type: [ release ]
steps:
@@ -503,12 +477,11 @@ jobs:
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ github.ref_name == 'main' }}
extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -531,6 +504,7 @@ jobs:
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- uses: actions/github-script@v6
@@ -608,6 +582,17 @@ jobs:
--input-objects=/tmp/coverage/binaries.list \
--format=lcov
- name: Upload coverage report
id: upload-coverage-report
env:
BUCKET: neon-github-public-dev
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Build coverage report NEW
id: upload-coverage-report-new
env:
@@ -644,11 +629,21 @@ jobs:
- uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
const { REPORT_URL_NEW, COMMIT_SHA } = process.env
const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${COMMIT_SHA}`,
state: 'success',
target_url: `${REPORT_URL}`,
context: 'Code coverage report',
})
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
@@ -700,8 +695,7 @@ jobs:
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
}
}"

View File

@@ -124,12 +124,12 @@ jobs:
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
CARGO_FEATURES: --features testing
CARGO_FLAGS: --release
CARGO_FLAGS: --locked --release
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -210,20 +210,18 @@ jobs:
- name: Run cargo build
run: |
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
cargo nextest run $CARGO_FEATURES
cargo test $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
@@ -233,7 +231,7 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
timeout-minutes: 90

View File

@@ -20,51 +20,111 @@ defaults:
run:
shell: bash -euo pipefail {0}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
permissions: {}
jobs:
tag-image:
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
outputs:
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
steps:
- name: Install Crane & ECR helper
run: |
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Get source image digest
id: next-digest
run: |
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
exit 1
fi
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
- name: Get destination image digest (if already exists)
id: prev-digest
run: |
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
if [ -z "${PREV_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
else
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
fi
- name: Tag image
run: |
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
rollback-tag-image:
needs: tag-image
if: ${{ !success() }}
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Install Crane & ECR helper
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v2
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install crane
- name: Configure ECR login
run: |
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Copy images
- name: Restore previous tag if needed
run: |
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
exit 0
fi
if [ -z "${PREV_DIGEST}" ]; then
# I guess we should delete the tag here/untag the image, but crane does not support it
# - https://github.com/google/go-containerregistry/issues/999
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
exit 0
fi
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
else
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
fi

396
Cargo.lock generated
View File

@@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5"
[[package]]
name = "addr2line"
version = "0.21.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
dependencies = [
"gimli",
]
@@ -278,13 +278,13 @@ dependencies = [
"camino",
"clap",
"control_plane",
"diesel",
"futures",
"git-version",
"hyper",
"metrics",
"pageserver_api",
"pageserver_client",
"postgres_backend",
"postgres_connection",
"serde",
"serde_json",
@@ -840,15 +840,15 @@ dependencies = [
[[package]]
name = "backtrace"
version = "0.3.69"
version = "0.3.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837"
checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca"
dependencies = [
"addr2line",
"cc",
"cfg-if",
"libc",
"miniz_oxide",
"miniz_oxide 0.6.2",
"object",
"rustc-demangle",
]
@@ -1144,6 +1144,16 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "close_fds"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed"
dependencies = [
"cfg-if",
"libc",
]
[[package]]
name = "colorchoice"
version = "1.0.0"
@@ -1205,7 +1215,7 @@ dependencies = [
"flate2",
"futures",
"hyper",
"nix 0.27.1",
"nix 0.26.2",
"notify",
"num_cpus",
"opentelemetry",
@@ -1317,13 +1327,11 @@ dependencies = [
"clap",
"comfy-table",
"compute_api",
"diesel",
"diesel_migrations",
"futures",
"git-version",
"hex",
"hyper",
"nix 0.27.1",
"nix 0.26.2",
"once_cell",
"pageserver_api",
"pageserver_client",
@@ -1333,7 +1341,6 @@ dependencies = [
"regex",
"reqwest",
"safekeeper_api",
"scopeguard",
"serde",
"serde_json",
"serde_with",
@@ -1629,52 +1636,6 @@ dependencies = [
"rusticata-macros",
]
[[package]]
name = "diesel"
version = "2.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
dependencies = [
"bitflags 2.4.1",
"byteorder",
"diesel_derives",
"itoa",
"pq-sys",
"serde_json",
]
[[package]]
name = "diesel_derives"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
dependencies = [
"diesel_table_macro_syntax",
"proc-macro2",
"quote",
"syn 2.0.32",
]
[[package]]
name = "diesel_migrations"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
dependencies = [
"diesel",
"migrations_internals",
"migrations_macros",
]
[[package]]
name = "diesel_table_macro_syntax"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
dependencies = [
"syn 2.0.32",
]
[[package]]
name = "digest"
version = "0.10.7"
@@ -1911,13 +1872,13 @@ dependencies = [
[[package]]
name = "filetime"
version = "0.2.22"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0"
checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153"
dependencies = [
"cfg-if",
"libc",
"redox_syscall 0.3.5",
"redox_syscall 0.2.16",
"windows-sys 0.48.0",
]
@@ -1934,7 +1895,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
dependencies = [
"crc32fast",
"miniz_oxide",
"miniz_oxide 0.7.1",
]
[[package]]
@@ -2132,9 +2093,9 @@ dependencies = [
[[package]]
name = "gimli"
version = "0.28.1"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
[[package]]
name = "git-version"
@@ -2601,16 +2562,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "io-uring"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762"
dependencies = [
"bitflags 1.3.2",
"libc",
]
[[package]]
name = "ipnet"
version = "2.9.0"
@@ -2725,12 +2676,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
@@ -2819,15 +2764,6 @@ dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
dependencies = [
"autocfg",
]
[[package]]
name = "metrics"
version = "0.1.0"
@@ -2836,33 +2772,9 @@ dependencies = [
"libc",
"once_cell",
"prometheus",
"rand 0.8.5",
"rand_distr",
"twox-hash",
"workspace_hack",
]
[[package]]
name = "migrations_internals"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
dependencies = [
"serde",
"toml",
]
[[package]]
name = "migrations_macros"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
dependencies = [
"migrations_internals",
"proc-macro2",
"quote",
]
[[package]]
name = "mime"
version = "0.3.17"
@@ -2885,6 +2797,15 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "miniz_oxide"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
dependencies = [
"adler",
]
[[package]]
name = "miniz_oxide"
version = "0.7.1"
@@ -2944,27 +2865,16 @@ dependencies = [
[[package]]
name = "nix"
version = "0.26.4"
version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b"
checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
dependencies = [
"bitflags 1.3.2",
"cfg-if",
"libc",
"memoffset 0.7.1",
"pin-utils",
]
[[package]]
name = "nix"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
dependencies = [
"bitflags 2.4.1",
"cfg-if",
"libc",
"memoffset 0.9.0",
"static_assertions",
]
[[package]]
@@ -2979,21 +2889,20 @@ dependencies = [
[[package]]
name = "notify"
version = "6.1.1"
version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d"
checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486"
dependencies = [
"bitflags 2.4.1",
"bitflags 1.3.2",
"crossbeam-channel",
"filetime",
"fsevent-sys",
"inotify 0.9.6",
"kqueue",
"libc",
"log",
"mio",
"walkdir",
"windows-sys 0.48.0",
"windows-sys 0.45.0",
]
[[package]]
@@ -3077,7 +2986,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [
"autocfg",
"libm",
]
[[package]]
@@ -3120,9 +3028,9 @@ dependencies = [
[[package]]
name = "object"
version = "0.32.2"
version = "0.30.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439"
dependencies = [
"memchr",
]
@@ -3194,9 +3102,9 @@ dependencies = [
[[package]]
name = "opentelemetry"
version = "0.20.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
@@ -3204,9 +3112,9 @@ dependencies = [
[[package]]
name = "opentelemetry-http"
version = "0.9.0"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
dependencies = [
"async-trait",
"bytes",
@@ -3217,56 +3125,54 @@ dependencies = [
[[package]]
name = "opentelemetry-otlp"
version = "0.13.0"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
dependencies = [
"async-trait",
"futures-core",
"futures",
"futures-util",
"http",
"opentelemetry",
"opentelemetry-http",
"opentelemetry-proto",
"opentelemetry-semantic-conventions",
"opentelemetry_api",
"opentelemetry_sdk",
"prost",
"reqwest",
"thiserror",
"tokio",
"tonic",
]
[[package]]
name = "opentelemetry-proto"
version = "0.3.0"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
"futures",
"futures-util",
"opentelemetry",
"prost",
"tonic",
"tonic 0.8.3",
]
[[package]]
name = "opentelemetry-semantic-conventions"
version = "0.12.0"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
dependencies = [
"opentelemetry",
]
[[package]]
name = "opentelemetry_api"
version = "0.20.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
dependencies = [
"fnv",
"futures-channel",
"futures-util",
"indexmap 1.9.3",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror",
@@ -3275,22 +3181,21 @@ dependencies = [
[[package]]
name = "opentelemetry_sdk"
version = "0.20.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
dependencies = [
"async-trait",
"crossbeam-channel",
"dashmap",
"fnv",
"futures-channel",
"futures-executor",
"futures-util",
"once_cell",
"opentelemetry_api",
"ordered-float 3.9.2",
"percent-encoding",
"rand 0.8.5",
"regex",
"serde_json",
"thiserror",
"tokio",
"tokio-stream",
@@ -3305,15 +3210,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "ordered-float"
version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
dependencies = [
"num-traits",
]
[[package]]
name = "ordered-multimap"
version = "0.7.1"
@@ -3408,6 +3304,7 @@ dependencies = [
"camino-tempfile",
"chrono",
"clap",
"close_fds",
"const_format",
"consumption_metrics",
"crc32c",
@@ -3428,7 +3325,7 @@ dependencies = [
"itertools",
"md5",
"metrics",
"nix 0.27.1",
"nix 0.26.2",
"num-traits",
"num_cpus",
"once_cell",
@@ -3461,7 +3358,6 @@ dependencies = [
"tenant_size_model",
"thiserror",
"tokio",
"tokio-epoll-uring",
"tokio-io-timeout",
"tokio-postgres",
"tokio-stream",
@@ -3483,7 +3379,6 @@ dependencies = [
"bincode",
"byteorder",
"bytes",
"chrono",
"const_format",
"enum-map",
"hex",
@@ -3885,15 +3780,6 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "pq-sys"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
dependencies = [
"vcpkg",
]
[[package]]
name = "pq_proto"
version = "0.1.0"
@@ -4058,6 +3944,7 @@ dependencies = [
"metrics",
"native-tls",
"once_cell",
"opentelemetry",
"parking_lot 0.12.1",
"parquet",
"parquet_derive",
@@ -4086,13 +3973,12 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"smallvec",
"smol_str",
"socket2 0.5.5",
"sync_wrapper",
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"tls-listener",
"tokio",
"tokio-postgres",
@@ -4100,6 +3986,7 @@ dependencies = [
"tokio-rustls",
"tokio-util",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"url",
@@ -4192,16 +4079,6 @@ dependencies = [
"getrandom 0.2.11",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
dependencies = [
"num-traits",
"rand 0.8.5",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@@ -4463,9 +4340,9 @@ dependencies = [
[[package]]
name = "reqwest-tracing"
version = "0.4.7"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3"
checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
dependencies = [
"anyhow",
"async-trait",
@@ -5231,12 +5108,15 @@ name = "smallvec"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
dependencies = [
"serde",
]
[[package]]
name = "smol_str"
version = "0.2.1"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
dependencies = [
"serde",
]
@@ -5319,7 +5199,7 @@ dependencies = [
"prost",
"tokio",
"tokio-stream",
"tonic",
"tonic 0.9.2",
"tonic-build",
"tracing",
"utils",
@@ -5503,18 +5383,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.47"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.47"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
dependencies = [
"proc-macro2",
"quote",
@@ -5539,38 +5419,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
dependencies = [
"byteorder",
"integer-encoding",
"ordered-float 2.10.1",
]
[[package]]
name = "tikv-jemalloc-ctl"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c"
dependencies = [
"libc",
"paste",
"tikv-jemalloc-sys",
]
[[package]]
name = "tikv-jemalloc-sys"
version = "0.5.4+5.3.0-patched"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "tikv-jemallocator"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca"
dependencies = [
"libc",
"tikv-jemalloc-sys",
"ordered-float",
]
[[package]]
@@ -5669,22 +5518,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
dependencies = [
"futures",
"nix 0.26.4",
"once_cell",
"scopeguard",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"uring-common",
]
[[package]]
name = "tokio-io-timeout"
version = "1.2.0"
@@ -5852,6 +5685,38 @@ dependencies = [
"winnow",
]
[[package]]
name = "tonic"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb"
dependencies = [
"async-stream",
"async-trait",
"axum",
"base64 0.13.1",
"bytes",
"futures-core",
"futures-util",
"h2",
"http",
"http-body",
"hyper",
"hyper-timeout",
"percent-encoding",
"pin-project",
"prost",
"prost-derive",
"tokio",
"tokio-stream",
"tokio-util",
"tower",
"tower-layer",
"tower-service",
"tracing",
"tracing-futures",
]
[[package]]
name = "tonic"
version = "0.9.2"
@@ -5995,6 +5860,16 @@ dependencies = [
"tracing-subscriber",
]
[[package]]
name = "tracing-futures"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2"
dependencies = [
"pin-project",
"tracing",
]
[[package]]
name = "tracing-log"
version = "0.1.3"
@@ -6008,9 +5883,9 @@ dependencies = [
[[package]]
name = "tracing-opentelemetry"
version = "0.20.0"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19"
checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
dependencies = [
"once_cell",
"opentelemetry",
@@ -6194,15 +6069,6 @@ dependencies = [
"webpki-roots 0.23.1",
]
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
dependencies = [
"io-uring",
"libc",
]
[[package]]
name = "url"
version = "2.3.1"
@@ -6256,7 +6122,7 @@ dependencies = [
"hyper",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
"nix 0.26.2",
"once_cell",
"pin-project-lite",
"postgres_connection",
@@ -6764,9 +6630,10 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"diesel",
"dashmap",
"either",
"fail",
"futures",
"futures-channel",
"futures-core",
"futures-executor",
@@ -6811,7 +6678,6 @@ dependencies = [
"tokio-util",
"toml_datetime",
"toml_edit",
"tonic",
"tower",
"tracing",
"tracing-core",

View File

@@ -64,6 +64,7 @@ camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
@@ -98,14 +99,14 @@ libc = "0.2"
md5 = "0.7.0"
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
nix = "0.26"
notify = "5.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
opentelemetry = "0.19.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0"
parking_lot = "0.12"
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "49.0.0"
@@ -117,7 +118,7 @@ rand = "0.8"
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
@@ -148,11 +149,8 @@ tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"
@@ -164,9 +162,8 @@ toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-opentelemetry = "0.19.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"

View File

@@ -53,7 +53,6 @@ RUN set -e \
--bin pagectl \
--bin safekeeper \
--bin storage_broker \
--bin attachment_service \
--bin proxy \
--bin neon_local \
--locked --release \
@@ -81,7 +80,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin

View File

@@ -52,7 +52,7 @@ RUN cd postgres && \
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
# In vanilla postgres this function is limited to Postgres role superuser.
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
# so we do it here.
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
@@ -63,14 +63,14 @@ RUN cd postgres && \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
fi; \
done; \
# the second loop is for pg_stat_statement extension versions >= 1.7,
# the second loop is for pg_stat_statement extension versions >= 1.7,
# where pg_stat_statement_reset() got 3 additional arguments
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if ! echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
fi; \
done
done
#########################################################################################
#
@@ -144,23 +144,30 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PLV8_VERSION=3.1.5 \
export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
;; \
"v16") \
export PLV8_VERSION=3.1.8 \
export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
;; \
*) \
echo "Export the valid PG_VERSION variable" && exit 1 \
;; \
esac && \
wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
# generate and copy upgrade scripts
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
cp upgrade/* /usr/local/pgsql/share/extension/ && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
# don't break computes with installed old version of plv8
cd /usr/local/pgsql/lib/ && \
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -241,12 +248,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/pgvector.patch /pgvector.patch
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
@@ -523,7 +527,8 @@ RUN apt-get update && \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
libboost-system1.74-dev \
libeigen3-dev
libeigen3-dev \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
@@ -548,8 +553,6 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-D RDK_INSTALL_INTREE=OFF \
-D RDK_INSTALL_COMIC_FONTS=OFF \
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
-D CMAKE_BUILD_TYPE=Release \
. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -904,7 +907,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# libboost*, libfreetype6, and zlib1g for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \
apt install --no-install-recommends -y \
@@ -917,6 +920,7 @@ RUN apt update && \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libfreetype6 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
@@ -928,6 +932,7 @@ RUN apt update && \
libcurl4-openssl-dev \
locales \
procps \
zlib1g \
ca-certificates && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -51,8 +51,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
#
# Top level Makefile to build Neon and PostgreSQL
@@ -176,10 +174,10 @@ neon-pg-ext-clean-%:
# Build walproposer as a static library. walproposer source code is located
# in the pgxn/neon directory.
#
#
# We also need to include libpgport.a and libpgcommon.a, because walproposer
# uses some functions from those libraries.
#
#
# Some object files are removed from libpgport.a and libpgcommon.a because
# they depend on openssl and other libraries that are not included in our
# Rust build.

View File

@@ -319,7 +319,7 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip_all, fields(%lsn))]
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Instant::now();
@@ -390,34 +390,6 @@ impl ComputeNode {
Ok(())
}
// Gets the basebackup in a retry loop
#[instrument(skip_all, fields(%lsn))]
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let mut retry_period_ms = 500;
let mut attempts = 0;
let max_attempts = 5;
loop {
let result = self.try_get_basebackup(compute_state, lsn);
match result {
Ok(_) => {
return result;
}
Err(ref e) if attempts < max_attempts => {
warn!(
"Failed to get basebackup: {} (attempt {}/{})",
e, attempts, max_attempts
);
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms));
retry_period_ms *= 2;
}
Err(_) => {
return result;
}
}
attempts += 1;
}
}
pub async fn check_safekeepers_synced_async(
&self,
compute_state: &ComputeState,
@@ -728,14 +700,13 @@ impl ComputeNode {
// In this case we need to connect with old `zenith_admin` name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let connstr = self.connstr.clone();
let mut client = match Client::connect(connstr.as_str(), NoTls) {
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
Err(e) => {
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = connstr.clone();
let mut zenith_admin_connstr = self.connstr.clone();
zenith_admin_connstr
.set_username("zenith_admin")
@@ -748,8 +719,8 @@ impl ComputeNode {
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connstring with expected name
Client::connect(connstr.as_str(), NoTls)?
// reconnect with connsting with expected name
Client::connect(self.connstr.as_str(), NoTls)?
}
Ok(client) => client,
};
@@ -763,8 +734,8 @@ impl ComputeNode {
cleanup_instance(&mut client)?;
handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?;
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
handle_grants(spec, &mut client, connstr.as_str())?;
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, &mut client, self.connstr.as_str())?;
handle_extensions(spec, &mut client)?;
handle_extension_neon(&mut client)?;
create_availability_check_data(&mut client)?;
@@ -772,12 +743,6 @@ impl ComputeNode {
// 'Close' connection
drop(client);
if self.has_feature(ComputeFeature::Migrations) {
thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client)
});
}
Ok(())
}
@@ -842,10 +807,6 @@ impl ComputeNode {
handle_grants(&spec, &mut client, self.connstr.as_str())?;
handle_extensions(&spec, &mut client)?;
handle_extension_neon(&mut client)?;
// We can skip handle_migrations here because a new migration can only appear
// if we have a new version of the compute_ctl binary, which can only happen
// if compute got restarted, in which case we'll end up inside of apply_config
// instead of reconfigure.
}
// 'Close' connection

View File

@@ -727,87 +727,3 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
Ok(())
}
#[instrument(skip_all)]
pub fn handle_migrations(client: &mut Client) -> Result<()> {
info!("handle migrations");
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
let migrations = [
"ALTER ROLE neon_superuser BYPASSRLS",
r#"
DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
END LOOP;
FOR role_name IN SELECT rolname FROM pg_roles
WHERE
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
END LOOP;
END $$;
"#,
r#"
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
END IF;
END
$$;"#,
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
query = "SELECT id FROM neon_migration.migration_id";
let row = client.query_one(query, &[])?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;
query = "BEGIN";
client.simple_query(query)?;
while current_migration < migrations.len() {
info!("Running migration:\n{}\n", migrations[current_migration]);
client.simple_query(migrations[current_migration])?;
current_migration += 1;
}
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client.simple_query(&setval)?;
query = "COMMIT";
client.simple_query(query)?;
info!(
"Ran {} migrations",
(migrations.len() - starting_migration_id)
);
Ok(())
}

View File

@@ -10,8 +10,6 @@ async-trait.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
diesel = { version = "2.1.4", features = ["postgres"]}
diesel_migrations = { version = "2.1.0", features = ["postgres"]}
futures.workspace = true
git-version.workspace = true
nix.workspace = true
@@ -21,7 +19,6 @@ hex.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true

View File

@@ -21,7 +21,9 @@ tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
# TODO: remove this after DB persistence is added, it is only used for
# a parsing function when loading pageservers from neon_local LocalEnv
postgres_backend.workspace = true
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }

View File

@@ -1,6 +0,0 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.
DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
DROP FUNCTION IF EXISTS diesel_set_updated_at();

View File

@@ -1,36 +0,0 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.
-- Sets up a trigger for the given table to automatically set a column called
-- `updated_at` whenever the row is modified (unless `updated_at` was included
-- in the modified columns)
--
-- # Example
--
-- ```sql
-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
--
-- SELECT diesel_manage_updated_at('users');
-- ```
CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
BEGIN
EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
BEGIN
IF (
NEW IS DISTINCT FROM OLD AND
NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
) THEN
NEW.updated_at := current_timestamp;
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;

View File

@@ -1,12 +0,0 @@
CREATE TABLE tenant_shards (
tenant_id VARCHAR NOT NULL,
shard_number INTEGER NOT NULL,
shard_count INTEGER NOT NULL,
PRIMARY KEY(tenant_id, shard_number, shard_count),
shard_stripe_size INTEGER NOT NULL,
generation INTEGER NOT NULL,
generation_pageserver BIGINT NOT NULL,
placement_policy VARCHAR NOT NULL,
-- config is JSON encoded, opaque to the database.
config TEXT NOT NULL
);

View File

@@ -1,10 +0,0 @@
CREATE TABLE nodes (
node_id BIGINT PRIMARY KEY NOT NULL,
scheduling_policy VARCHAR NOT NULL,
listen_http_addr VARCHAR NOT NULL,
listen_http_port INTEGER NOT NULL,
listen_pg_addr VARCHAR NOT NULL,
listen_pg_port INTEGER NOT NULL
);

View File

@@ -1,18 +1,14 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::service::Service;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
};
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::parse_request_param;
use utils::id::{TenantId, TimelineId};
use utils::id::TenantId;
use utils::{
http::{
@@ -108,163 +104,34 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
json_response(StatusCode::OK, state.service.inspect(inspect_req))
}
async fn handle_tenant_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
}
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids
// needing to track a "deleting" state for tenants.
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
{
let started_at = Instant::now();
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
// completed.
let mut retry_period = Duration::from_secs(1);
// On subsequent retries, wait longer.
let max_retry_period = Duration::from_secs(5);
// Enable callers with a 30 second request timeout to reliably get a response
let max_wait = Duration::from_secs(25);
loop {
let status = f(service.clone()).await?;
match status {
StatusCode::ACCEPTED => {
tracing::info!("Deletion accepted, waiting to try again...");
tokio::time::sleep(retry_period).await;
retry_period = max_retry_period;
}
StatusCode::NOT_FOUND => {
tracing::info!("Deletion complete");
return json_response(StatusCode::OK, ());
}
_ => {
tracing::warn!("Unexpected status {status}");
return json_response(status, ());
}
}
let now = Instant::now();
if now + retry_period > started_at + max_wait {
tracing::info!("Deletion timed out waiting for 404");
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
// the pageserver's swagger definition for this endpoint, and has the same desired
// effect of causing the control plane to retry later.
return json_response(StatusCode::CONFLICT, ());
}
}
}
async fn handle_tenant_location_config(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
service
.tenant_location_config(tenant_id, config_req)
.await?,
state.service.tenant_create(create_req).await?,
)
}
async fn handle_tenant_delete(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
deletion_wrapper(service, move |service| async move {
service.tenant_delete(tenant_id).await
})
.await
}
async fn handle_tenant_timeline_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
service
state
.service
.tenant_timeline_create(tenant_id, create_req)
.await?,
)
}
async fn handle_tenant_timeline_delete(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
let state = get_state(&req);
deletion_wrapper(service, move |service| async move {
service.tenant_timeline_delete(tenant_id, timeline_id).await
})
.await
}
async fn handle_tenant_timeline_passthrough(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let Some(path) = req.uri().path_and_query() else {
// This should never happen, our request router only calls us if there is a path
return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
};
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
// Find the node that holds shard zero
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
// Callers will always pass an unsharded tenant ID. Before proxying, we must
// rewrite this to a shard-aware shard zero ID.
let path = format!("{}", path);
let tenant_str = tenant_id.to_string();
let tenant_shard_str = format!("{}", tenant_shard_id);
let path = path.replace(&tenant_str, &tenant_shard_str);
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
let resp = client.get_raw(path).await.map_err(|_e|
// FIXME: give APiError a proper Unavailable variant. We return 503 here because
// if we can't successfully send a request to the pageserver, we aren't available.
ApiError::ShuttingDown)?;
// We have a reqest::Response, would like a http::Response
let mut builder = hyper::Response::builder()
.status(resp.status())
.version(resp.version());
for (k, v) in resp.headers() {
builder = builder.header(k, v);
}
let response = builder
.body(Body::wrap_stream(resp.bytes_stream()))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)
}
async fn handle_tenant_locate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -274,11 +141,6 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, ())
}
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
@@ -292,15 +154,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}
async fn handle_tenant_shard_migrate(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
service
state
.service
.tenant_shard_migrate(tenant_shard_id, migrate_req)
.await?,
)
@@ -317,35 +178,6 @@ impl From<ReconcileError> for ApiError {
}
}
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
{
let state = get_state(&request);
let service = state.service.clone();
let startup_complete = service.startup_complete.clone();
if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
.await
.is_err()
{
// This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate
// timeouts around its remote calls, to bound its runtime.
return Err(ApiError::Timeout(
"Timed out waiting for service readiness".into(),
));
}
request_span(
request,
|request| async move { handler(service, request).await },
)
.await
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -364,67 +196,23 @@ pub fn make_router(
router
.data(Arc::new(HttpState::new(service, auth)))
// Non-prefixed generic endpoints (status, metrics)
.get("/status", |r| request_span(r, handle_status))
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
.post("/upcall/v1/re-attach", |r| {
request_span(r, handle_re_attach)
})
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
// Test/dev/debug endpoints
.post("/debug/v1/attach-hook", |r| {
request_span(r, handle_attach_hook)
})
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
// Node operations
.post("/control/v1/node", |r| {
request_span(r, handle_node_register)
})
.get("/control/v1/node", |r| request_span(r, handle_node_list))
.put("/control/v1/node/:node_id/config", |r| {
request_span(r, handle_node_configure)
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(r, handle_tenant_shard_migrate)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
.post("/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_create)
})
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
// Tenant Shard operations (low level/maintenance)
.put("/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(r, handle_tenant_shard_migrate)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
tenant_service_handler(r, handle_tenant_timeline_create)
})
// Tenant detail GET passthrough to shard zero
.get("/v1/tenant/:tenant_id*", |r| {
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
// timeline GET APIs will be implicitly included.
.get("/v1/tenant/:tenant_id/timeline*", |r| {
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
// Path aliases for tests_forward_compatibility
// TODO: remove these in future PR
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
.post("/node", |r| request_span(r, handle_node_register))
.put("/node/:node_id/config", |r| {
request_span(r, handle_node_configure)
})
.post("/tenant", |r| request_span(r, handle_tenant_create))
.post("/tenant/:tenant_id/timeline", |r| {
request_span(r, handle_tenant_timeline_create)
})
.get("/tenant/:tenant_id/locate", |r| {
request_span(r, handle_tenant_locate)
})
.put("/tenant/:tenant_shard_id/migrate", |r| {
request_span(r, handle_tenant_shard_migrate)
})
}

View File

@@ -7,7 +7,6 @@ mod node;
pub mod persistence;
mod reconciler;
mod scheduler;
mod schema;
pub mod service;
mod tenant_state;
@@ -18,8 +17,6 @@ enum PlacementPolicy {
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Do not attach to any pageservers
Detached,
}
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]

View File

@@ -12,9 +12,9 @@ use camino::Utf8PathBuf;
use clap::Parser;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use tokio::signal::unix::SignalKind;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
use utils::{project_build_tag, project_git_version, tcp_listener};
@@ -39,11 +39,7 @@ struct Cli {
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
database_url: String,
path: Utf8PathBuf,
}
#[tokio::main]
@@ -62,7 +58,7 @@ async fn main() -> anyhow::Result<()> {
GIT_VERSION,
launch_ts.to_string(),
BUILD_TAG,
args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
args.path,
args.listen
);
@@ -70,10 +66,9 @@ async fn main() -> anyhow::Result<()> {
jwt_token: args.jwt_token,
};
let json_path = args.path;
let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
let persistence = Arc::new(Persistence::new(&args.path).await);
let service = Service::spawn(config, persistence.clone()).await?;
let service = Service::spawn(config, persistence).await?;
let http_listener = tcp_listener::bind(args.listen)?;
@@ -86,31 +81,20 @@ async fn main() -> anyhow::Result<()> {
let router = make_router(service, auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
tracing::info!("Serving on {0}", args.listen);
tokio::task::spawn(server);
// Wait until we receive a signal
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
tokio::select! {
_ = sigint.recv() => {},
_ = sigterm.recv() => {},
_ = sigquit.recv() => {},
}
tracing::info!("Terminating on signal");
if json_path.is_some() {
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
// full postgres dumps around.
if let Err(e) = persistence.write_tenants_json().await {
tracing::error!("Failed to write JSON on shutdown: {e}")
ShutdownSignals::handle(|signal| match signal {
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
tracing::info!("Got {}. Terminating", signal.name());
// We're just a test helper: no graceful shutdown.
std::process::exit(0);
}
}
})?;
std::process::exit(0);
Ok(())
}

View File

@@ -1,8 +1,6 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
use crate::persistence::NodePersistence;
#[derive(Clone)]
pub(crate) struct Node {
pub(crate) id: NodeId,
@@ -36,15 +34,4 @@ impl Node {
NodeSchedulingPolicy::Pause => false,
}
}
pub(crate) fn to_persistent(&self) -> NodePersistence {
NodePersistence {
node_id: self.id.0 as i64,
scheduling_policy: self.scheduling.into(),
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port as i32,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port as i32,
}
}
}

View File

@@ -1,295 +1,181 @@
use std::collections::HashMap;
use std::str::FromStr;
use std::{collections::HashMap, str::FromStr};
use camino::Utf8Path;
use camino::Utf8PathBuf;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use camino::{Utf8Path, Utf8PathBuf};
use control_plane::{
attachment_service::{NodeAvailability, NodeSchedulingPolicy},
local_env::LocalEnv,
};
use pageserver_api::{
models::TenantConfig,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
use utils::id::{NodeId, TenantId};
use utils::{
generation::Generation,
id::{NodeId, TenantId},
};
use crate::node::Node;
use crate::PlacementPolicy;
use crate::{node::Node, PlacementPolicy};
/// ## What do we store?
///
/// The attachment service does not store most of its state durably.
///
/// The essential things to store durably are:
/// - generation numbers, as these must always advance monotonically to ensure data safety.
/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
/// - Node's scheduling policies, as the source of truth for these is something external.
///
/// Other things we store durably as an implementation detail:
/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat,
/// but it is operationally simpler to make this service the authority for which nodes
/// it talks to.
///
/// ## Performance/efficiency
///
/// The attachment service does not go via the database for most things: there are
/// a couple of places where we must, and where efficiency matters:
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
/// before it can attach a tenant, so this acts as a bound on how fast things like
/// failover can happen.
/// - Pageserver re-attach: we will increment many shards' generations when this happens,
/// so it is important to avoid e.g. issuing O(N) queries.
///
/// Database calls relating to nodes have low performance requirements, as they are very rarely
/// updated, and reads of nodes are always from memory, not the database. We only require that
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
/// Placeholder for storage. This will be replaced with a database client.
pub struct Persistence {
database_url: String,
// In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
// test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
// compatible just yet.
json_path: Option<Utf8PathBuf>,
state: std::sync::Mutex<PersistentState>,
}
/// Legacy format, for use in JSON compat objects in test environment
// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct JsonPersistence {
struct PersistentState {
tenants: HashMap<TenantShardId, TenantShardPersistence>,
#[serde(skip)]
path: Utf8PathBuf,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum DatabaseError {
#[error(transparent)]
Query(#[from] diesel::result::Error),
#[error(transparent)]
Connection(#[from] diesel::result::ConnectionError),
#[error("Logical error: {0}")]
Logical(String),
/// A convenience for serializing the state inside a sync lock, and then
/// writing it to disk outside of the lock. This will go away when switching
/// to a database backend.
struct PendingWrite {
bytes: Vec<u8>,
path: Utf8PathBuf,
}
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
impl PendingWrite {
async fn commit(&self) -> anyhow::Result<()> {
tokio::fs::write(&self.path, &self.bytes).await?;
impl Persistence {
pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
Self {
database_url,
json_path,
Ok(())
}
}
impl PersistentState {
fn save(&self) -> PendingWrite {
PendingWrite {
bytes: serde_json::to_vec(self).expect("Serialization error"),
path: self.path.clone(),
}
}
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let database_url = self.database_url.clone();
tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
// TODO: connection pooling, such as via diesel::r2d2
let mut conn = PgConnection::establish(&database_url)?;
func(&mut conn)
})
.await
.expect("Task panic")
}
async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
let bytes = tokio::fs::read(path).await?;
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
decoded.path = path.to_owned();
/// When a node is first registered, persist it before using it for anything
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
let np = node.to_persistent();
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
})
.await
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
let nodes: Vec<Node> = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table
.load::<NodePersistence>(conn)?
.into_iter()
.map(|n| Node {
id: NodeId(n.node_id as u64),
// At startup we consider a node offline until proven otherwise.
availability: NodeAvailability::Offline,
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
.expect("Bad scheduling policy in DB"),
listen_http_addr: n.listen_http_addr,
listen_http_port: n.listen_http_port as u16,
listen_pg_addr: n.listen_pg_addr,
listen_pg_port: n.listen_pg_port as u16,
})
.collect::<Vec<Node>>())
})
.await?;
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
Ok(nodes)
}
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
let loaded = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
})
.await?;
if loaded.is_empty() {
if let Some(path) = &self.json_path {
if tokio::fs::try_exists(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
{
tracing::info!("Importing from legacy JSON format at {path}");
return self.list_tenant_shards_json(path).await;
}
}
}
Ok(loaded)
}
/// Shim for automated compatibility tests: load tenants from a JSON file instead of database
pub(crate) async fn list_tenant_shards_json(
&self,
path: &Utf8Path,
) -> DatabaseResult<Vec<TenantShardPersistence>> {
let bytes = tokio::fs::read(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
for (tenant_id, tenant) in &mut decoded.tenants {
// Backward compat: an old attachments.json from before PR #6251, replace
// empty strings with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = tenant_id.to_string();
tenant.config = serde_json::to_string(&TenantConfig::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
tenant.tenant_id = format!("{}", tenant_id);
tenant.config = serde_json::to_string(&TenantConfig::default())?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
}
}
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
// Synchronize database with what is in the JSON file
self.insert_tenant_shards(tenants.clone()).await?;
Ok(tenants)
Ok(decoded)
}
/// For use in testing environments, where we dump out JSON on shutdown.
pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
let Some(path) = &self.json_path else {
anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
};
tracing::info!("Writing state to {path}...");
let tenants = self.list_tenant_shards().await?;
let mut tenants_map = HashMap::new();
for tsp in tenants {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
tenants_map.insert(tenant_shard_id, tsp);
async fn load_or_new(path: &Utf8Path) -> Self {
match Self::load(path).await {
Ok(s) => {
tracing::info!("Loaded state file at {}", path);
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path);
Self {
tenants: HashMap::new(),
path: path.to_owned(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
}
}
let json = serde_json::to_string(&JsonPersistence {
tenants: tenants_map,
})?;
}
}
tokio::fs::write(path, &json).await?;
tracing::info!("Wrote {} bytes to {path}...", json.len());
impl Persistence {
pub async fn new(path: &Utf8Path) -> Self {
let state = PersistentState::load_or_new(path).await;
Self {
state: std::sync::Mutex::new(state),
}
}
/// When registering a node, persist it so that on next start we will be able to
/// iterate over known nodes to synchronize their tenant shard states with our observed state.
pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
// TODO: node persitence will come with database backend
Ok(())
}
/// At startup, we populate the service's list of nodes, and use this list to call into
/// each node to do an initial reconciliation of the state of the world with our in-memory
/// observed state.
pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
let env = LocalEnv::load_config()?;
// TODO: node persitence will come with database backend
// XXX hack: enable test_backward_compatibility to work by populating our list of
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
// first startup in the compat test, we may have shards but no nodes.
let mut result = Vec::new();
tracing::info!(
"Loaded {} pageserver nodes from LocalEnv",
env.pageservers.len()
);
for ps_conf in env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
result.push(Node {
id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
});
}
Ok(result)
}
/// At startup, we populate our map of tenant shards from persistent storage.
pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
let locked = self.state.lock().unwrap();
Ok(locked.tenants.values().cloned().collect())
}
/// Tenants must be persisted before we schedule them for the first time. This enables us
/// to correctly retain generation monotonicity, and the externally provided placement policy & config.
pub(crate) async fn insert_tenant_shards(
&self,
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
})?;
Ok(())
})
.await
}
) -> anyhow::Result<()> {
let write = {
let mut locked = self.state.lock().unwrap();
for shard in shards {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
shard_number: ShardNumber(shard.shard_number as u8),
shard_count: ShardCount(shard.shard_count as u8),
};
/// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
/// the tenant from memory on this server.
#[allow(unused)]
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
locked.tenants.insert(tenant_shard_id, shard);
}
locked.save()
};
Ok(())
})
.await
}
write.commit().await?;
/// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
/// batched increment of the generations of all tenants whose generation_pageserver is equal to
/// the node that called /re-attach.
#[tracing::instrument(skip_all, fields(node_id))]
pub(crate) async fn re_attach(
&self,
node_id: NodeId,
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_conn(move |conn| {
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(node_id.0 as i64))
.set(generation.eq(generation + 1))
.execute(conn)?;
tracing::info!("Incremented {} tenants' generations", rows_updated);
// TODO: UPDATE+SELECT in one query
let updated = tenant_shards
.filter(generation_pageserver.eq(node_id.0 as i64))
.select(TenantShardPersistence::as_select())
.load(conn)?;
Ok(updated)
})
.await?;
let mut result = HashMap::new();
for tsp in updated {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
Ok(result)
Ok(())
}
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
@@ -298,48 +184,49 @@ impl Persistence {
pub(crate) async fn increment_generation(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
node_id: Option<NodeId>,
) -> anyhow::Result<Generation> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
.set((
generation.eq(generation + 1),
generation_pageserver.eq(node_id.0 as i64),
))
// TODO: only returning() the generation column
.returning(TenantShardPersistence::as_returning())
.get_result(conn)?;
let (write, gen) = {
let mut locked = self.state.lock().unwrap();
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
anyhow::bail!("Tried to increment generation of unknown shard");
};
Ok(updated)
})
.await?;
// If we're called with a None pageserver, we need only update the generation
// record to disassociate it with this pageserver, not actually increment the number, as
// the increment is guaranteed to happen the next time this tenant is attached.
if node_id.is_some() {
shard.generation += 1;
}
Ok(Generation::new(updated.generation as u32))
shard.generation_pageserver = node_id;
let gen = Generation::new(shard.generation);
(locked.save(), gen)
};
write.commit().await?;
Ok(gen)
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
.set((
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
pub(crate) async fn re_attach(
&self,
node_id: NodeId,
) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
let (write, result) = {
let mut result = HashMap::new();
let mut locked = self.state.lock().unwrap();
for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
if shard.generation_pageserver == Some(node_id) {
shard.generation += 1;
result.insert(*tenant_shard_id, Generation::new(shard.generation));
}
}
Ok(updated)
})
.await?;
(locked.save(), result)
};
Ok(())
write.commit().await?;
Ok(result)
}
// TODO: when we start shard splitting, we must durably mark the tenant so that
@@ -359,8 +246,7 @@ impl Persistence {
}
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
#[diesel(table_name = crate::schema::tenant_shards)]
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct TenantShardPersistence {
#[serde(default)]
pub(crate) tenant_id: String,
@@ -371,28 +257,16 @@ pub(crate) struct TenantShardPersistence {
#[serde(default)]
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: i64,
pub(crate) generation_pageserver: Option<NodeId>,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: u32,
#[serde(default)]
pub(crate) placement_policy: String,
#[serde(default)]
pub(crate) config: String,
}
/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
#[diesel(table_name = crate::schema::nodes)]
pub(crate) struct NodePersistence {
pub(crate) node_id: i64,
pub(crate) scheduling_policy: String,
pub(crate) listen_http_addr: String,
pub(crate) listen_http_port: i32,
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: i32,
}

View File

@@ -296,7 +296,7 @@ impl Reconciler {
// Increment generation before attaching to new pageserver
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.increment_generation(self.tenant_shard_id, Some(dest_ps_id))
.await?;
let dest_conf = build_location_config(
@@ -395,7 +395,7 @@ impl Reconciler {
// as locations with unknown (None) observed state.
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.increment_generation(self.tenant_shard_id, Some(node_id))
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!("Observed configuration requires update.");

View File

@@ -1,27 +0,0 @@
// @generated automatically by Diesel CLI.
diesel::table! {
nodes (node_id) {
node_id -> Int8,
scheduling_policy -> Varchar,
listen_http_addr -> Varchar,
listen_http_port -> Int4,
listen_pg_addr -> Varchar,
listen_pg_port -> Int4,
}
}
diesel::table! {
tenant_shards (tenant_id, shard_number, shard_count) {
tenant_id -> Varchar,
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
config -> Text,
}
}
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);

View File

@@ -11,7 +11,6 @@ use control_plane::attachment_service::{
TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use diesel::result::DatabaseErrorKind;
use hyper::StatusCode;
use pageserver_api::{
control_api::{
@@ -21,24 +20,22 @@ use pageserver_api::{
models,
models::{
LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
TimelineCreateRequest, TimelineInfo,
},
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api;
use utils::{
completion::Barrier,
generation::Generation,
http::error::ApiError,
id::{NodeId, TenantId, TimelineId},
id::{NodeId, TenantId},
seqwait::SeqWait,
};
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
persistence::{Persistence, TenantShardPersistence},
scheduler::Scheduler,
tenant_state::{
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -49,10 +46,6 @@ use crate::{
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
// Top level state available to all HTTP handlers
struct ServiceState {
tenants: BTreeMap<TenantShardId, TenantState>,
@@ -86,27 +79,10 @@ pub struct Config {
pub jwt_token: Option<String>,
}
impl From<DatabaseError> for ApiError {
fn from(err: DatabaseError) -> ApiError {
match err {
DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
// FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
DatabaseError::Connection(_e) => ApiError::ShuttingDown,
DatabaseError::Logical(reason) => {
ApiError::InternalServerError(anyhow::anyhow!(reason))
}
}
}
}
pub struct Service {
inner: Arc<std::sync::RwLock<ServiceState>>,
config: Config,
persistence: Arc<Persistence>,
/// This waits for initial reconciliation with pageservers to complete. Until this barrier
/// passes, it isn't safe to do any actions that mutate tenants.
pub(crate) startup_complete: Barrier,
}
impl From<ReconcileWaitError> for ApiError {
@@ -120,32 +96,77 @@ impl From<ReconcileWaitError> for ApiError {
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
}
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
tracing::info!("Loading nodes from database...");
let mut nodes = persistence.list_nodes().await?;
tracing::info!("Loaded {} nodes from database.", nodes.len());
tracing::info!("Loading shards from database...");
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
tracing::info!(
"Loaded {} shards from database.",
tenant_shard_persistence.len()
);
let mut tenants = BTreeMap::new();
for tsp in tenant_shard_persistence {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
let shard_identity = if tsp.shard_count == 0 {
ShardIdentity::unsharded()
} else {
ShardIdentity::new(
ShardNumber(tsp.shard_number as u8),
ShardCount(tsp.shard_count as u8),
ShardStripeSize(tsp.shard_stripe_size as u32),
)?
};
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
// Note that we load generation, but don't care about generation_pageserver. We will either end up finding
// our existing attached location and it will match generation_pageserver, or we will attach somewhere new
// and update generation_pageserver in the process.
generation: Generation::new(tsp.generation),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent: IntentState::new(),
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
};
tenants.insert(tenant_shard_id, new_tenant);
}
/// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
/// until this is done.
async fn startup_reconcile(&self) {
// For all tenant shards, a vector of observed states on nodes (where None means
// indeterminate, same as in [`ObservedStateLocation`])
let mut observed = HashMap::new();
let nodes = {
let locked = self.inner.read().unwrap();
locked.nodes.clone()
};
// TODO: issue these requests concurrently
for node in nodes.values() {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
for node in &mut nodes {
let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
tracing::info!("Scanning shards on node {}...", node.id);
match client.list_location_config().await {
Err(e) => {
tracing::warn!("Could not contact pageserver {} ({e})", node.id);
// TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
// pageserver is being restarted at the same time as we are
// TODO: be more tolerant, apply a generous 5-10 second timeout
// TODO: setting a node to Offline is a dramatic thing to do, and can
// prevent neon_local from starting up (it starts this service before
// any pageservers are running). It may make sense to give nodes
// a Pending state to accomodate this situation, and allow (but deprioritize)
// scheduling on Pending nodes.
//node.availability = NodeAvailability::Offline;
}
Ok(listing) => {
tracing::info!(
@@ -153,6 +174,7 @@ impl Service {
listing.tenant_shards.len(),
node.id
);
node.availability = NodeAvailability::Active;
for (tenant_shard_id, conf_opt) in listing.tenant_shards {
observed.insert(tenant_shard_id, (node.id, conf_opt));
@@ -164,46 +186,41 @@ impl Service {
let mut cleanup = Vec::new();
// Populate intent and observed states for all tenants, based on reported state on pageservers
let shard_count = {
let mut locked = self.inner.write().unwrap();
for (tenant_shard_id, (node_id, observed_loc)) in observed {
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id));
continue;
};
for (tenant_shard_id, (node_id, observed_loc)) in observed {
let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id));
continue;
};
tenant_state
.observed
.locations
.insert(node_id, ObservedStateLocation { conf: observed_loc });
tenant_state
.observed
.locations
.insert(node_id, ObservedStateLocation { conf: observed_loc });
}
// State of nodes is now frozen, transform to a HashMap.
let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
// Populate each tenant's intent state
let mut scheduler = Scheduler::new(&tenants, &nodes);
for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
tenant_state.intent_from_observed();
if let Err(e) = tenant_state.schedule(&mut scheduler) {
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
// not enough pageservers are available. The tenant may well still be available
// to clients.
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
}
// Populate each tenant's intent state
let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
tenant_state.intent_from_observed();
if let Err(e) = tenant_state.schedule(&mut scheduler) {
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
// not enough pageservers are available. The tenant may well still be available
// to clients.
tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
}
}
locked.tenants.len()
};
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
// generation_pageserver in the database.
}
// Clean up any tenants that were found on pageservers but are not known to us.
for (tenant_shard_id, node_id) in cleanup {
// A node reported a tenant_shard_id which is unknown to us: detach it.
let node = nodes
.get(&node_id)
.get_mut(&node_id)
.expect("Always exists: only known nodes are scanned");
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
match client
.location_config(
tenant_shard_id,
@@ -235,80 +252,13 @@ impl Service {
}
}
// Finally, now that the service is up and running, launch reconcile operations for any tenants
// which require it: under normal circumstances this should only include tenants that were in some
// transient state before we restarted.
let reconcile_tasks = self.reconcile_all();
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
}
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
tracing::info!("Loading nodes from database...");
let nodes = persistence.list_nodes().await?;
let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
tracing::info!("Loaded {} nodes from database.", nodes.len());
tracing::info!("Loading shards from database...");
let tenant_shard_persistence = persistence.list_tenant_shards().await?;
tracing::info!(
"Loaded {} shards from database.",
tenant_shard_persistence.len()
);
let mut tenants = BTreeMap::new();
for tsp in tenant_shard_persistence {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
let shard_identity = if tsp.shard_count == 0 {
ShardIdentity::unsharded()
} else {
ShardIdentity::new(
ShardNumber(tsp.shard_number as u8),
ShardCount(tsp.shard_count as u8),
ShardStripeSize(tsp.shard_stripe_size as u32),
)?
};
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if tsp.generation_pageserver != i64::MAX {
intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
}
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: Generation::new(tsp.generation as u32),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
};
tenants.insert(tenant_shard_id, new_tenant);
}
let (startup_completion, startup_complete) = utils::completion::channel();
let shard_count = tenants.len();
let this = Arc::new(Self {
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
result_tx, nodes, tenants,
))),
config,
persistence,
startup_complete,
});
let result_task_this = this.clone();
@@ -366,13 +316,11 @@ impl Service {
}
});
let startup_reconcile_this = this.clone();
tokio::task::spawn(async move {
// Block the [`Service::startup_complete`] barrier until we're done
let _completion = startup_completion;
startup_reconcile_this.startup_reconcile().await
});
// Finally, now that the service is up and running, launch reconcile operations for any tenants
// which require it: under normal circumstances this should only include tenants that were in some
// transient state before we restarted.
let reconcile_tasks = this.reconcile_all();
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
Ok(this)
}
@@ -388,6 +336,7 @@ impl Service {
let locked = self.inner.write().unwrap();
!locked.tenants.contains_key(&attach_req.tenant_shard_id)
};
if insert {
let tsp = TenantShardPersistence {
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
@@ -395,49 +344,31 @@ impl Service {
shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
shard_stripe_size: 0,
generation: 0,
generation_pageserver: i64::MAX,
generation_pageserver: None,
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
};
match self.persistence.insert_tenant_shards(vec![tsp]).await {
Err(e) => match e {
DatabaseError::Query(diesel::result::Error::DatabaseError(
DatabaseErrorKind::UniqueViolation,
_,
)) => {
tracing::info!(
"Raced with another request to insert tenant {}",
attach_req.tenant_shard_id
)
}
_ => return Err(e.into()),
},
Ok(()) => {
tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
self.persistence.insert_tenant_shards(vec![tsp]).await?;
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(
attach_req.tenant_shard_id,
TenantState::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Single,
),
);
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
}
}
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(
attach_req.tenant_shard_id,
TenantState::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Single,
),
);
}
let new_generation = if let Some(req_node_id) = attach_req.node_id {
let new_generation = if attach_req.node_id.is_some() {
Some(
self.persistence
.increment_generation(attach_req.tenant_shard_id, req_node_id)
.increment_generation(attach_req.tenant_shard_id, attach_req.node_id)
.await?,
)
} else {
self.persistence.detach(attach_req.tenant_shard_id).await?;
None
};
@@ -449,11 +380,6 @@ impl Service {
if let Some(new_generation) = new_generation {
tenant_state.generation = new_generation;
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during attachment service restart.
assert!(attach_req.node_id.is_none());
tenant_state.policy = PlacementPolicy::Detached;
}
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
@@ -481,7 +407,6 @@ impl Service {
"attach_hook: tenant {} set generation {:?}, pageserver {}",
attach_req.tenant_shard_id,
tenant_state.generation,
// TODO: this is an odd number of 0xf's
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
@@ -574,14 +499,6 @@ impl Service {
id: req_tenant.id,
valid,
});
} else {
// After tenant deletion, we may approve any validation. This avoids
// spurious warnings on the pageserver if it has pending LSN updates
// at the point a deletion happens.
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid: true,
});
}
}
response
@@ -636,8 +553,8 @@ impl Service {
shard_number: tenant_shard_id.shard_number.0 as i32,
shard_count: tenant_shard_id.shard_count.0 as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
generation_pageserver: i64::MAX,
generation: 0,
generation_pageserver: None,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
})
@@ -678,7 +595,6 @@ impl Service {
})?;
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: entry
.get()
.intent
@@ -711,7 +627,6 @@ impl Service {
})?;
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: state
.intent
.attached
@@ -745,257 +660,14 @@ impl Service {
(waiters, response_shards)
};
self.await_waiters(waiters).await?;
Ok(TenantCreateResponse {
shards: response_shards,
})
}
/// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
/// wait for reconciliation to complete before responding.
async fn await_waiters(
&self,
waiters: Vec<ReconcilerWaiter>,
) -> Result<(), ReconcileWaitError> {
let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
for waiter in waiters {
let timeout = deadline.duration_since(Instant::now());
waiter.wait_timeout(timeout).await?;
}
Ok(())
}
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Detached to switch to PolicyMode::Detached
///
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
/// secondary locations.
pub(crate) async fn tenant_location_config(
&self,
tenant_id: TenantId,
req: TenantLocationConfigRequest,
) -> Result<TenantLocationConfigResponse, ApiError> {
if req.tenant_id.shard_count.0 > 1 {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"This API is for importing single-sharded or unsharded tenants"
)));
}
let mut waiters = Vec::new();
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
let maybe_create = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let pageservers = locked.nodes.clone();
let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
// Maybe we have existing shards
let mut create = true;
for (shard_id, shard) in locked
.tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
{
// Saw an existing shard: this is not a creation
create = false;
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
// cloud control plane into this service.
// Use location config mode as an indicator of policy: if they ask for
// attached we go to default HA attached mode. If they ask for secondary
// we go to secondary-only mode. If they ask for detached we detach.
match req.config.mode {
LocationConfigMode::Detached => {
shard.policy = PlacementPolicy::Detached;
}
LocationConfigMode::Secondary => {
// TODO: implement secondary-only mode.
todo!();
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// TODO: persistence for changes in policy
if pageservers.len() > 1 {
shard.policy = PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
shard.policy = PlacementPolicy::Single
}
}
}
shard.schedule(&mut scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
&pageservers,
&compute_hook,
&self.config,
&self.persistence,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.attached {
result.shards.push(TenantShardLocation {
shard_id: *shard_id,
node_id,
})
}
}
if create {
// Validate request mode
match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// When using this API to onboard an existing tenant to this service, it must start in
// an attached state, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Imported tenant must be in attached mode"
)));
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Pass
}
}
// Validate request generation
let Some(generation) = req.config.generation else {
// We can only import attached tenants, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Generation is mandatory when importing tenant"
)));
};
// Synthesize a creation request
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: Some(generation),
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count do distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
})
} else {
None
}
};
if let Some(create_req) = maybe_create {
let create_resp = self.tenant_create(create_req).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
} else {
// This was an update, wait for reconciliation
self.await_waiters(waiters).await?;
}
Ok(result)
}
pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
// TODO: refactor into helper
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.attached.ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
// TODO: error out if the tenant is not attached anywhere.
// Phase 1: delete on the pageservers
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
// surface immediately as an error to our caller.
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting shard {tenant_shard_id} on node {}: {e}",
node.id
))
})?;
tracing::info!(
"Shard {tenant_shard_id} on node {}, delete returned {}",
node.id,
status
);
if status == StatusCode::ACCEPTED {
any_pending = true;
}
}
if any_pending {
// Caller should call us again later. When we eventually see 404s from
// all the shards, we may proceed to delete our records of the tenant.
tracing::info!(
"Tenant {} has some shards pending deletion, returning 202",
tenant_id
);
return Ok(StatusCode::ACCEPTED);
}
// Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
// our in-memory state and database state.
// Ordering: we delete persistent state first: if we then
// crash, we will drop the in-memory state.
// Drop persistent state.
self.persistence.delete_tenant(tenant_id).await?;
// Drop in-memory state
{
let mut locked = self.inner.write().unwrap();
locked
.tenants
.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id);
tracing::info!(
"Deleted tenant {tenant_id}, now have {} tenants",
locked.tenants.len()
);
};
// Success is represented as 404, to imitate the existing pageserver deletion API
Ok(StatusCode::NOT_FOUND)
Ok(TenantCreateResponse {
shards: response_shards,
})
}
pub(crate) async fn tenant_timeline_create(
@@ -1005,15 +677,25 @@ impl Service {
) -> Result<TimelineInfo, ApiError> {
let mut timeline_info = None;
tracing::info!(
"Creating timeline {}/{}",
tenant_id,
create_req.new_timeline_id,
);
let ensure_waiters = {
let locked = self.inner.write().unwrap();
tracing::info!(
"Creating timeline {}/{}, have {} pageservers",
tenant_id,
create_req.new_timeline_id,
locked.nodes.len()
);
self.ensure_attached_wait(tenant_id).await?;
self.ensure_attached(locked, tenant_id)
.map_err(ApiError::InternalServerError)?
};
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
for waiter in ensure_waiters {
let timeout = deadline.duration_since(Instant::now());
waiter.wait_timeout(timeout).await?;
}
// TODO: refuse to do this if shard splitting is in progress
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
@@ -1084,111 +766,6 @@ impl Service {
Ok(timeline_info.expect("targets cannot be empty"))
}
pub(crate) async fn tenant_timeline_delete(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<StatusCode, ApiError> {
tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,);
self.ensure_attached_wait(tenant_id).await?;
// TODO: refuse to do this if shard splitting is in progress
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.attached.ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
if targets.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
));
}
// TODO: call into shards concurrently
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
tracing::info!(
"Deleting timeline on shard {}/{}, attached to node {}",
tenant_shard_id,
timeline_id,
node.id
);
let status = client
.timeline_delete(tenant_shard_id, timeline_id)
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
node.id
))
})?;
if status == StatusCode::ACCEPTED {
any_pending = true;
}
}
if any_pending {
Ok(StatusCode::ACCEPTED)
} else {
Ok(StatusCode::NOT_FOUND)
}
}
/// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
/// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound)
pub(crate) fn tenant_shard0_baseurl(
&self,
tenant_id: TenantId,
) -> Result<(String, TenantShardId), ApiError> {
let locked = self.inner.read().unwrap();
let Some((tenant_shard_id, shard)) = locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
};
// TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
// point to somewhere we haven't attached yet.
let Some(node_id) = shard.intent.attached else {
return Err(ApiError::Conflict(
"Cannot call timeline API on non-attached tenant".to_string(),
));
};
let Some(node) = locked.nodes.get(&node_id) else {
// This should never happen
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Shard refers to nonexistent node"
)));
};
Ok((node.base_url(), *tenant_shard_id))
}
pub(crate) fn tenant_locate(
&self,
tenant_id: TenantId,
@@ -1291,6 +868,7 @@ impl Service {
} else {
let old_attached = shard.intent.attached;
shard.intent.attached = Some(migrate_req.node_id);
match shard.policy {
PlacementPolicy::Single => {
shard.intent.secondary.clear();
@@ -1304,13 +882,7 @@ impl Service {
shard.intent.secondary.push(old_attached);
}
}
PlacementPolicy::Detached => {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
)))
}
}
shard.intent.attached = Some(migrate_req.node_id);
tracing::info!("Migrating: new intent {:?}", shard.intent);
shard.sequence = shard.sequence.next();
@@ -1334,20 +906,6 @@ impl Service {
Ok(TenantShardMigrateResponse {})
}
pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
// It is convenient to avoid taking the big lock and converting Node to a serializable
// structure, by fetching from storage instead of reading in-memory state.
let nodes = self
.persistence
.list_nodes()
.await?
.into_iter()
.map(|n| n.to_persistent())
.collect();
Ok(nodes)
}
pub(crate) async fn node_register(
&self,
register_req: NodeRegisterRequest,
@@ -1397,7 +955,10 @@ impl Service {
availability: NodeAvailability::Active,
};
// TODO: idempotency if the node already exists in the database
self.persistence.insert_node(&new_node).await?;
self.persistence
.insert_node(&new_node)
.await
.map_err(ApiError::InternalServerError)?;
let mut locked = self.inner.write().unwrap();
let mut new_nodes = (*locked.nodes).clone();
@@ -1521,7 +1082,7 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
fn ensure_attached_schedule(
fn ensure_attached(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
tenant_id: TenantId,
@@ -1551,23 +1112,6 @@ impl Service {
Ok(waiters)
}
async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
let ensure_waiters = {
let locked = self.inner.write().unwrap();
self.ensure_attached_schedule(locked, tenant_id)
.map_err(ApiError::InternalServerError)?
};
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
for waiter in ensure_waiters {
let timeout = deadline.duration_since(Instant::now());
waiter.wait_timeout(timeout).await?;
}
Ok(())
}
/// Check all tenants for pending reconciliation work, and reconcile those in need
///
/// Returns how many reconciliation tasks were started

View File

@@ -312,18 +312,6 @@ impl TenantState {
modified = true;
}
}
Detached => {
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.attached = None;
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.secondary.clear();
modified = true;
}
}
}
if modified {

View File

@@ -1,11 +1,5 @@
use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use diesel::{
backend::Backend,
query_builder::{AstPass, QueryFragment, QueryId},
Connection, PgConnection, QueryResult, RunQueryDsl,
};
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
use camino::Utf8PathBuf;
use hyper::Method;
use pageserver_api::{
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
@@ -13,11 +7,10 @@ use pageserver_api::{
};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{env, str::FromStr};
use tokio::process::Command;
use std::{path::PathBuf, process::Child, str::FromStr};
use tracing::instrument;
use url::Url;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId},
@@ -26,17 +19,14 @@ use utils::{
pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
path: PathBuf,
jwt_token: Option<String>,
public_key_path: Option<Utf8PathBuf>,
postgres_port: u16,
client: reqwest::Client,
}
const COMMAND: &str = "attachment_service";
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -60,7 +50,6 @@ pub struct InspectResponse {
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
@@ -180,9 +169,7 @@ pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
.unwrap()
.join("attachments.json");
let path = env.base_data_dir.join("attachments.json");
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
@@ -194,13 +181,6 @@ impl AttachmentService {
listen_url.port().unwrap()
);
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
// port, for use by our captive postgres.
let postgres_port = listen_url
.port()
.expect("Control plane API setting should always have a port")
+ 1;
// Assume all pageservers have symmetric auth configuration: this service
// expects to use one JWT token to talk to all of them.
let ps_conf = env
@@ -229,7 +209,6 @@ impl AttachmentService {
listen,
jwt_token,
public_key_path,
postgres_port,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
@@ -241,214 +220,13 @@ impl AttachmentService {
.expect("non-Unicode path")
}
/// PIDFile for the postgres instance used to store attachment service state
fn postgres_pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(
self.env
.base_data_dir
.join("attachment_service_postgres.pid"),
)
.expect("non-Unicode path")
}
pub async fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
/// In order to access database migrations, we need to find the Neon source tree
async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
// We assume that either prd or our binary is in the source tree. The former is usually
// true for automated test runners, the latter is usually true for developer workstations. Often
// both are true, which is fine.
let candidate_start_points = [
// Current working directory
Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
// Directory containing the binary we're running inside
Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
];
// For each candidate start point, search through ancestors looking for a neon.git source tree root
for start_point in &candidate_start_points {
// Start from the build dir: assumes we are running out of a built neon source tree
for path in start_point.ancestors() {
// A crude approximation: the root of the source tree is whatever contains a "control_plane"
// subdirectory.
let control_plane = path.join("control_plane");
if tokio::fs::try_exists(&control_plane).await? {
return Ok(path.to_owned());
}
}
}
// Fall-through
Err(anyhow::anyhow!(
"Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
))
}
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
///
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
/// to other versions if that one isn't found. Some automated tests create circumstances
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
for v in prefer_versions {
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
if tokio::fs::try_exists(&path).await? {
return Ok(path);
}
}
// Fall through
anyhow::bail!(
"Postgres binaries not found in {}",
self.env.pg_distrib_dir.display()
);
}
/// Readiness check for our postgres process
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
let bin_path = pg_bin_dir.join("pg_isready");
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
Ok(exitcode.success())
}
/// Create our database if it doesn't exist, and run migrations.
///
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
/// who just want to run `cargo neon_local` without knowing about diesel.
///
/// Returns the database url
pub async fn setup_database(&self) -> anyhow::Result<String> {
let database_url = format!(
"postgresql://localhost:{}/attachment_service",
self.postgres_port
);
println!("Running attachment service database setup...");
fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
let base = ::url::Url::parse(database_url).unwrap();
let database = base.path_segments().unwrap().last().unwrap().to_owned();
let mut new_url = base.join(default_database).unwrap();
new_url.set_query(base.query());
(database, new_url.into())
}
#[derive(Debug, Clone)]
pub struct CreateDatabaseStatement {
db_name: String,
}
impl CreateDatabaseStatement {
pub fn new(db_name: &str) -> Self {
CreateDatabaseStatement {
db_name: db_name.to_owned(),
}
}
}
impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
out.push_sql("CREATE DATABASE ");
out.push_identifier(&self.db_name)?;
Ok(())
}
}
impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
impl QueryId for CreateDatabaseStatement {
type QueryId = ();
const HAS_STATIC_QUERY_ID: bool = false;
}
if PgConnection::establish(&database_url).is_err() {
let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
println!("Creating database: {database}");
let mut conn = PgConnection::establish(&postgres_url)?;
CreateDatabaseStatement::new(&database).execute(&mut conn)?;
}
let mut conn = PgConnection::establish(&database_url)?;
let migrations_dir = self
.find_source_root()
.await?
.join("control_plane/attachment_service/migrations");
let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
println!("Running migrations in {}", migrations.path().display());
HarnessWithOutput::write_to_stdout(&mut conn)
.run_pending_migrations(migrations)
.map(|_| ())
.map_err(|e| anyhow::anyhow!(e))?;
println!("Migrations complete");
Ok(database_url)
}
pub async fn start(&self) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the attachment service for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
.join("attachment_service_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
let status = child.wait().await?;
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}", self.postgres_port),
)
.await?;
};
println!("Starting attachment service database...");
let db_start_args = [
"-w",
"-D",
pg_data_path.as_ref(),
"-l",
pg_log_path.as_ref(),
"start",
];
background_process::start_process(
"attachment_service_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
[],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|| self.pg_isready(&pg_bin_dir),
)
.await?;
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
let mut args = vec![
"-l",
&self.listen,
"-p",
self.path.as_ref(),
"--database-url",
&database_url,
]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
let mut args = vec!["-l", &self.listen, "-p", &path_str]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
@@ -457,7 +235,7 @@ impl AttachmentService {
args.push(format!("--public-key={public_key_path}"));
}
background_process::start_process(
let result = background_process::start_process(
COMMAND,
&self.env.base_data_dir,
&self.env.attachment_service_bin(),
@@ -474,46 +252,29 @@ impl AttachmentService {
}
},
)
.await?;
.await;
Ok(())
}
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
println!("Stopping attachment service database...");
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_stop_args)
.spawn()?
.wait()
for ps_conf in &self.env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
self.node_register(NodeRegisterRequest {
node_id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
if !stop_status.success() {
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_status_args)
.spawn()?
.wait()
.await?;
// pg_ctl status returns this exit code if postgres is not running: in this case it is
// fine that stop failed. Otherwise it is an error that stop failed.
const PG_STATUS_NOT_RUNNING: i32 = 3;
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
println!("Attachment service data base is already stopped");
return Ok(());
} else {
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
}
}
Ok(())
result
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())
}
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
@@ -525,15 +286,13 @@ impl AttachmentService {
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let listen_url = self.env.control_plane_api.clone().unwrap();
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
))
.unwrap();
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join(&path)
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
@@ -570,7 +329,7 @@ impl AttachmentService {
let response = self
.dispatch::<_, AttachHookResponse>(
Method::POST,
"debug/v1/attach-hook".to_string(),
"attach-hook".to_string(),
Some(request),
)
.await?;
@@ -586,11 +345,7 @@ impl AttachmentService {
let request = InspectRequest { tenant_shard_id };
let response = self
.dispatch::<_, InspectResponse>(
Method::POST,
"debug/v1/inspect".to_string(),
Some(request),
)
.dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
.await?;
Ok(response.attachment)
@@ -601,18 +356,14 @@ impl AttachmentService {
&self,
req: TenantCreateRequest,
) -> anyhow::Result<TenantCreateResponse> {
self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
self.dispatch(Method::POST, "tenant".to_string(), Some(req))
.await
}
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(
Method::GET,
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await
self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
.await
}
#[instrument(skip(self))]
@@ -634,7 +385,7 @@ impl AttachmentService {
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
.await
}
@@ -642,7 +393,7 @@ impl AttachmentService {
pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/node/{}/config", req.node_id),
format!("node/{}/config", req.node_id),
Some(req),
)
.await
@@ -662,7 +413,7 @@ impl AttachmentService {
) -> anyhow::Result<TimelineInfo> {
self.dispatch(
Method::POST,
format!("v1/tenant/{tenant_id}/timeline"),
format!("tenant/{tenant_id}/timeline"),
Some(req),
)
.await

View File

@@ -17,7 +17,7 @@ use std::io::Write;
use std::os::unix::prelude::AsRawFd;
use std::os::unix::process::CommandExt;
use std::path::Path;
use std::process::Command;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
envs: EI,
initial_pid_file: InitialPidFile,
process_status_check: F,
) -> anyhow::Result<()>
) -> anyhow::Result<Child>
where
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
@@ -98,7 +98,7 @@ where
InitialPidFile::Expect(path) => path,
};
let spawned_process = filled_cmd.spawn().with_context(|| {
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
let pid = spawned_process.id();
@@ -106,26 +106,12 @@ where
i32::try_from(pid)
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
// set up a scopeguard to kill & wait for the child in case we panic or bail below
let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
println!("SIGKILL & wait the started process");
(|| {
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo).
spawned_process.kill().context("SIGKILL child")?;
spawned_process.wait().context("wait() for child process")?;
anyhow::Ok(())
})()
.with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
.unwrap();
});
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check).await {
Ok(true) => {
println!("\n{process_name} started and passed status check, pid: {pid}");
// leak the child process, it'll outlive this neon_local invocation
drop(scopeguard::ScopeGuard::into_inner(spawned_process));
return Ok(());
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
}
Ok(false) => {
if retries == NOTICE_AFTER_RETRIES {
@@ -140,15 +126,16 @@ where
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("error starting process {process_name:?}: {e:#}");
println!("{process_name} failed to start: {e:#}");
if let Err(e) = spawned_process.kill() {
println!("Could not stop {process_name} subprocess: {e:#}")
};
return Err(e);
}
}
}
println!();
anyhow::bail!(
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
);
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -256,9 +243,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_PROFILE",
// HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
"HOME",
"AWS_SESSION_TOKEN",
"AZURE_STORAGE_ACCOUNT",
"AZURE_STORAGE_ACCESS_KEY",
] {

View File

@@ -51,7 +51,7 @@ project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
fn default_conf(num_pageservers: u16) -> String {
let mut template = format!(
@@ -135,7 +135,7 @@ fn main() -> Result<()> {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
@@ -1056,9 +1056,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args), *register)
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
@@ -1087,7 +1086,24 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args), false)
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("migrate", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
@@ -1145,7 +1161,7 @@ async fn handle_attachment_service(
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = svc.stop(immediate).await {
if let Err(e) = svc.stop(immediate) {
eprintln!("stop failed: {}", e);
exit(1);
}
@@ -1241,7 +1257,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start().await {
eprintln!("attachment_service start failed: {:#}", e);
try_stop_all(env, true).await;
try_stop_all(env, true);
exit(1);
}
}
@@ -1249,11 +1265,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match), true)
.start(&pageserver_config_overrides(sub_match))
.await
{
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
try_stop_all(env, true);
exit(1);
}
}
@@ -1262,23 +1278,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![]).await {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false).await;
try_stop_all(env, false);
exit(1);
}
}
Ok(())
}
async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let immediate =
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
try_stop_all(env, immediate).await;
try_stop_all(env, immediate);
Ok(())
}
async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
// Stop all endpoints
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
@@ -1313,7 +1329,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.stop(immediate).await {
if let Err(e) = attachment_service.stop(immediate) {
eprintln!("attachment service stop failed: {e:#}");
}
}
@@ -1533,11 +1549,7 @@ fn cli() -> Command {
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
.long("register")
.default_value("true").required(false)
.value_parser(value_parser!(bool))
.value_name("register"))
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")

View File

@@ -57,7 +57,7 @@ use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
// contents of a endpoint.json file
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -70,7 +70,6 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
features: Vec<ComputeFeature>,
}
//
@@ -141,7 +140,6 @@ impl ComputeControlPlane {
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates: true,
features: vec![],
});
ep.create_endpoint_dir()?;
@@ -156,7 +154,6 @@ impl ComputeControlPlane {
pg_port,
pg_version,
skip_pg_catalog_updates: true,
features: vec![],
})?,
)?;
std::fs::write(
@@ -218,9 +215,6 @@ pub struct Endpoint {
// Optimizations
skip_pg_catalog_updates: bool,
// Feature flags
features: Vec<ComputeFeature>,
}
impl Endpoint {
@@ -250,7 +244,6 @@ impl Endpoint {
tenant_id: conf.tenant_id,
pg_version: conf.pg_version,
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
features: conf.features,
})
}
@@ -438,7 +431,7 @@ impl Endpoint {
}
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
// TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -526,7 +519,7 @@ impl Endpoint {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: self.features.clone(),
features: vec![],
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
@@ -583,21 +576,9 @@ impl Endpoint {
}
let child = cmd.spawn()?;
// set up a scopeguard to kill & wait for the child in case we panic or bail below
let child = scopeguard::guard(child, |mut child| {
println!("SIGKILL & wait the started process");
(|| {
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
child.kill().context("SIGKILL child")?;
child.wait().context("wait() for child process")?;
anyhow::Ok(())
})()
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
.unwrap();
});
// Write down the pid so we can wait for it when we want to stop
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
// TODO use background_process::start_process instead
let pid = child.id();
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
std::fs::write(pidfile_path, pid.to_string())?;
@@ -646,9 +627,6 @@ impl Endpoint {
std::thread::sleep(ATTEMPT_INTERVAL);
}
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
drop(scopeguard::ScopeGuard::into_inner(child));
Ok(())
}

View File

@@ -223,11 +223,7 @@ impl LocalEnv {
}
pub fn attachment_service_bin(&self) -> PathBuf {
// Irrespective of configuration, attachment service binary is always
// run from the same location as neon_local. This means that for compatibility
// tests that run old pageserver/safekeeper, they still run latest attachment service.
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
neon_local_bin_dir.join("attachment_service")
self.neon_distrib_dir.join("attachment_service")
}
pub fn safekeeper_bin(&self) -> PathBuf {

View File

@@ -11,7 +11,7 @@ use std::io;
use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::Command;
use std::process::{Child, Command};
use std::time::Duration;
use anyhow::{bail, Context};
@@ -30,7 +30,6 @@ use utils::{
lsn::Lsn,
};
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
@@ -162,8 +161,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
self.start_node(config_overrides, false, register).await
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false).await
}
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -208,8 +207,7 @@ impl PageServerNode {
&self,
config_overrides: &[&str],
update_config: bool,
register: bool,
) -> anyhow::Result<()> {
) -> anyhow::Result<Child> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -246,26 +244,7 @@ impl PageServerNode {
}
},
)
.await?;
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
Ok(())
.await
}
fn pageserver_basic_args<'a>(
@@ -395,11 +374,6 @@ impl PageServerNode {
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -500,11 +474,6 @@ impl PageServerNode {
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
}
};

View File

@@ -7,6 +7,7 @@
//! ```
use std::io::Write;
use std::path::PathBuf;
use std::process::Child;
use std::{io, result};
use anyhow::Context;
@@ -103,7 +104,7 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),

View File

@@ -1,9 +0,0 @@
# For documentation on how to configure this file,
# see https://diesel.rs/guides/configuring-diesel-cli
[print_schema]
file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory]
dir = "control_plane/attachment_service/migrations"

View File

@@ -90,9 +90,6 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
/// Enable running migrations
Migrations,
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.

View File

@@ -9,10 +9,5 @@ prometheus.workspace = true
libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
rand = "0.8"
rand_distr = "0.4.3"

View File

@@ -1,523 +0,0 @@
//! HyperLogLog is an algorithm for the count-distinct problem,
//! approximating the number of distinct elements in a multiset.
//! Calculating the exact cardinality of the distinct elements
//! of a multiset requires an amount of memory proportional to
//! the cardinality, which is impractical for very large data sets.
//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
//! use significantly less memory than this, but can only approximate the cardinality.
use std::{
collections::HashMap,
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
};
use prometheus::{
core::{self, Describer},
proto, Opts,
};
use twox_hash::xxh3;
/// Create an [`HyperLogLogVec`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll_vec {
($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
$crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
}};
($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
$crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
}};
}
/// Create an [`HyperLogLog`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll {
($N:literal, $OPTS:expr $(,)?) => {{
let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
$crate::register(Box::new(hll.clone())).map(|_| hll)
}};
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
}};
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
}
struct HyperLogLogVecCore<const N: usize> {
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
pub desc: core::Desc,
pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogVec<N> {
/// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
}
fn record(&self, hash: u64) {
let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition,
// but HLL is not impacted by a write in one shard happening in between.
// This is because in PromQL we will be implementing a harmonic mean of all buckets.
// we will also merge samples in a time series using `max by (hll_shard)`.
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
})
}
}
fn make_label_pairs(
desc: &core::Desc,
label_values: &[&str],
) -> prometheus::Result<Vec<proto::LabelPair>> {
if desc.variable_labels.len() != label_values.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: desc.variable_labels.len(),
got: label_values.len(),
});
}
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
if total_len == 0 {
return Ok(vec![]);
}
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec;
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
let mut metrics = vec![];
hll.core
.children
.read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
}
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
let mut buckets = [0.0; 32];
for metric in metrics.chunks_exact(32) {
if filter(&metric[0]) {
for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
}
}
buckets
.into_iter()
.map(|f| 2.0f64.powf(-f))
.sum::<f64>()
.recip()
* 0.697
* 32.0
* 32.0
}
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new();
let mut set_b = HashSet::new();
for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits());
hll.with_label_values(&["a"]).measure(&x.to_bits());
}
for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits());
hll.with_label_values(&["b"]).measure(&x.to_bits());
}
let merge = &set_a | &set_b;
let metrics = collect(&hll);
let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
}
#[test]
fn test_cardinality_small() {
let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
assert_eq!(actual, [46, 30, 32]);
assert!(51.3 < estimate[0] && estimate[0] < 51.4);
assert!(44.0 < estimate[1] && estimate[1] < 44.1);
assert!(39.0 < estimate[2] && estimate[2] < 39.1);
}
#[test]
fn test_cardinality_medium() {
let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
assert_eq!(actual, [2529, 1618, 1629]);
assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
}
#[test]
fn test_cardinality_large() {
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
assert_eq!(actual, [129077, 79579, 79630]);
assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
}
#[test]
fn test_cardinality_small2() {
let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
assert_eq!(actual, [92, 58, 60]);
assert!(116.1 < estimate[0] && estimate[0] < 116.2);
assert!(81.7 < estimate[1] && estimate[1] < 81.8);
assert!(69.3 < estimate[2] && estimate[2] < 69.4);
}
#[test]
fn test_cardinality_medium2() {
let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
assert_eq!(actual, [8201, 5131, 5051]);
assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
}
#[test]
fn test_cardinality_large2() {
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
assert_eq!(actual, [777847, 482069, 482246]);
assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
}
}

View File

@@ -28,9 +28,7 @@ use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
mod hll;
pub mod metric_vec_duration;
pub use hll::{HyperLogLog, HyperLogLogVec};
pub type UIntGauge = GenericGauge<AtomicU64>;
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;

View File

@@ -20,7 +20,6 @@ strum_macros.workspace = true
hex.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono.workspace = true
workspace_hack.workspace = true

View File

@@ -1,11 +1,9 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::{fmt, ops::Range};
use std::fmt;
use crate::reltag::{BlockNumber, RelTag, SlruKind};
use crate::reltag::{BlockNumber, RelTag};
/// Key used in the Repository kv-store.
///
@@ -145,390 +143,12 @@ impl Key {
}
}
// Layout of the Key address space
//
// The Key struct, used to address the underlying key-value store, consists of
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
// all the data and metadata keys into those 18 bytes.
//
// Principles for the mapping:
//
// - Things that are often accessed or modified together, should be close to
// each other in the key space. For example, if a relation is extended by one
// block, we create a new key-value pair for the block data, and update the
// relation size entry. Because of that, the RelSize key comes after all the
// RelBlocks of a relation: the RelSize and the last RelBlock are always next
// to each other.
//
// The key space is divided into four major sections, identified by the first
// byte, and the form a hierarchy:
//
// 00 Relation data and metadata
//
// DbDir () -> (dbnode, spcnode)
// Filenodemap
// RelDir -> relnode forknum
// RelBlocks
// RelSize
//
// 01 SLRUs
//
// SlruDir kind
// SlruSegBlocks segno
// SlruSegSize
//
// 02 pg_twophase
//
// 03 misc
// Controlfile
// checkpoint
// pg_version
//
// 04 aux files
//
// Below is a full list of the keyspace allocation:
//
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
//
// Filenodemap:
// 00 SPCNODE DBNODE 00000000 00 00000000
//
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
//
// RelBlock:
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
//
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
//
// SlruDir:
// 01 kind 00000000 00000000 00 00000000
//
// SlruSegBlock:
// 01 kind 00000001 SEGNO 00 BLKNUM
//
// SlruSegSize:
// 01 kind 00000001 SEGNO 00 FFFFFFFF
//
// TwoPhaseDir:
// 02 00000000 00000000 00000000 00 00000000
//
// TwoPhaseFile:
// 02 00000000 00000000 00000000 00 XID
//
// ControlFile:
// 03 00000000 00000000 00000000 00 00000000
//
// Checkpoint:
// 03 00000000 00000000 00000000 00 00000001
//
// AuxFiles:
// 03 00000000 00000000 00000000 00 00000002
//
//-- Section 01: relation data and metadata
pub const DBDIR_KEY: Key = Key {
field1: 0x00,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
#[inline(always)]
pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0xffffffff,
field5: 0xff,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}
}
#[inline(always)]
pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 1,
}
}
#[inline(always)]
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: blknum,
}
}
#[inline(always)]
pub fn rel_size_to_key(rel: RelTag) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn rel_key_range(rel: RelTag) -> Range<Key> {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0,
}..Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum + 1,
field6: 0,
}
}
//-- Section 02: SLRUs
#[inline(always)]
pub fn slru_dir_to_key(kind: SlruKind) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
#[inline(always)]
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: blknum,
}
}
#[inline(always)]
pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
let field2 = match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
};
Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 0,
field6: 0,
}..Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 1,
field6: 0,
}
}
//-- Section 03: pg_twophase
pub const TWOPHASEDIR_KEY: Key = Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
#[inline(always)]
pub fn twophase_file_key(xid: TransactionId) -> Key {
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}
}
#[inline(always)]
pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
let (next_xid, overflowed) = xid.overflowing_add(1);
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}..Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: u8::from(overflowed),
field6: next_xid,
}
}
//-- Section 03: Control file
pub const CONTROLFILE_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
pub const CHECKPOINT_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 1,
};
pub const AUX_FILES_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 2,
};
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
#[inline(always)]
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
#[inline(always)]
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
#[inline(always)]
pub fn is_slru_block_key(key: Key) -> bool {
key.field1 == 0x01 // SLRU-related
&& key.field3 == 0x00000001 // but not SlruDir
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[inline(always)]
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
}
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
#[inline(always)]
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (

View File

@@ -63,84 +63,16 @@ impl KeySpace {
KeyPartitioning { parts }
}
/// Update the keyspace such that it doesn't contain any range
/// that is overlapping with `other`. This can involve splitting or
/// removing of existing ranges.
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
_ => {
// self is empty
return;
}
};
// Key spaces are sorted by definition, so skip ahead to the first
// potentially intersecting range. Similarly, ignore ranges that start
// after the current keyspace ends.
let other_ranges = other
.ranges
.iter()
.skip_while(|range| self_start >= range.end)
.take_while(|range| self_end > range.start);
for range in other_ranges {
while let Some(overlap_at) = self.overlaps_at(range) {
let overlapped = self.ranges[overlap_at].clone();
if overlapped.start < range.start && overlapped.end <= range.end {
// Higher part of the range is completely overlapped.
self.ranges[overlap_at].end = range.start;
}
if overlapped.start >= range.start && overlapped.end > range.end {
// Lower part of the range is completely overlapped.
self.ranges[overlap_at].start = range.end;
}
if overlapped.start < range.start && overlapped.end > range.end {
// Middle part of the range is overlapped.
self.ranges[overlap_at].end = range.start;
self.ranges
.insert(overlap_at + 1, range.end..overlapped.end);
}
if overlapped.start >= range.start && overlapped.end <= range.end {
// Whole range is overlapped
self.ranges.remove(overlap_at);
}
}
}
}
pub fn start(&self) -> Option<Key> {
self.ranges.first().map(|range| range.start)
}
pub fn end(&self) -> Option<Key> {
self.ranges.last().map(|range| range.end)
}
#[allow(unused)]
pub fn total_size(&self) -> usize {
self.ranges
.iter()
.map(|range| key_range_size(range) as usize)
.sum()
}
fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => None,
Err(0) => None,
Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
_ => None,
}
}
///
/// Check if key space contains overlapping range
///
pub fn overlaps(&self, range: &Range<Key>) -> bool {
self.overlaps_at(range).is_some()
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => false,
Err(0) => false,
Ok(index) => self.ranges[index - 1].end > range.start,
Err(index) => self.ranges[index - 1].end > range.start,
}
}
}
@@ -172,7 +104,6 @@ pub struct KeySpaceAccum {
accum: Option<Range<Key>>,
ranges: Vec<Range<Key>>,
size: u64,
}
impl KeySpaceAccum {
@@ -180,7 +111,6 @@ impl KeySpaceAccum {
Self {
accum: None,
ranges: Vec::new(),
size: 0,
}
}
@@ -191,8 +121,6 @@ impl KeySpaceAccum {
#[inline(always)]
pub fn add_range(&mut self, range: Range<Key>) {
self.size += key_range_size(&range) as u64;
match self.accum.as_mut() {
Some(accum) => {
if range.start == accum.end {
@@ -218,23 +146,6 @@ impl KeySpaceAccum {
ranges: self.ranges,
}
}
pub fn consume_keyspace(&mut self) -> KeySpace {
if let Some(accum) = self.accum.take() {
self.ranges.push(accum);
}
let mut prev_accum = KeySpaceAccum::new();
std::mem::swap(self, &mut prev_accum);
KeySpace {
ranges: prev_accum.ranges,
}
}
pub fn size(&self) -> u64 {
self.size
}
}
///
@@ -343,30 +254,6 @@ mod tests {
}
}
#[test]
fn keyspace_consume() {
let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
let mut accum = KeySpaceAccum::new();
for range in &ranges {
accum.add_range(range.clone());
}
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
assert_eq!(accum.size(), expected_size);
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
assert_eq!(accum.size(), 0);
assert_ks_eq(&accum.consume_keyspace(), vec![]);
assert_eq!(accum.size(), 0);
for range in &ranges {
accum.add_range(range.clone());
}
assert_ks_eq(&accum.to_keyspace(), ranges);
}
#[test]
fn keyspace_add_range() {
// two separate ranges
@@ -509,118 +396,4 @@ mod tests {
// xxxxxxxxxxx
assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
}
#[test]
fn test_remove_full_overlapps() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(4),
Key::from_i128(5)..Key::from_i128(8),
Key::from_i128(10)..Key::from_i128(12),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(2)..Key::from_i128(3),
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(13),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(2),
Key::from_i128(3)..Key::from_i128(4),
Key::from_i128(5)..Key::from_i128(6),
Key::from_i128(7)..Key::from_i128(8),
Key::from_i128(10)..Key::from_i128(11)
]
);
}
#[test]
fn test_remove_partial_overlaps() {
// Test partial ovelaps
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(8)..Key::from_i128(11),
Key::from_i128(14)..Key::from_i128(17),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(7)..Key::from_i128(8),
Key::from_i128(12)..Key::from_i128(14),
]
);
}
#[test]
fn test_remove_no_overlaps() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(12),
Key::from_i128(15)..Key::from_i128(17),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
]
);
}
#[test]
fn test_remove_one_range_overlaps_multiple() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(6)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
Key::from_i128(17)..Key::from_i128(20),
Key::from_i128(20)..Key::from_i128(30),
Key::from_i128(30)..Key::from_i128(40),
],
};
let key_space2 = KeySpace {
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(6)..Key::from_i128(9),
Key::from_i128(19)..Key::from_i128(20),
Key::from_i128(20)..Key::from_i128(30),
Key::from_i128(30)..Key::from_i128(40),
]
);
}
}

View File

@@ -8,7 +8,6 @@ use std::{
};
use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
@@ -272,7 +271,6 @@ pub struct TenantConfig {
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -366,19 +364,6 @@ pub struct TenantLocationConfigRequest {
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantShardLocation {
pub shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigResponse {
pub shards: Vec<TenantShardLocation>,
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigRequest {
@@ -454,8 +439,6 @@ pub struct TenantDetails {
#[serde(flatten)]
pub tenant_info: TenantInfo,
pub walredo: Option<WalRedoManagerStatus>,
pub timelines: Vec<TimelineId>,
}
@@ -643,12 +626,6 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub pid: Option<u32>,
}
// Wrapped in libpq CopyData
#[derive(PartialEq, Eq, Debug)]
pub enum PagestreamFeMessage {
@@ -656,7 +633,6 @@ pub enum PagestreamFeMessage {
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
GetSlruSegment(PagestreamGetSlruSegmentRequest),
}
// Wrapped in libpq CopyData
@@ -667,7 +643,6 @@ pub enum PagestreamBeMessage {
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
GetSlruSegment(PagestreamGetSlruSegmentResponse),
}
// Keep in sync with `pagestore_client.h`
@@ -678,7 +653,6 @@ enum PagestreamBeMessageTag {
GetPage = 102,
Error = 103,
DbSize = 104,
GetSlruSegment = 105,
}
impl TryFrom<u8> for PagestreamBeMessageTag {
type Error = u8;
@@ -689,7 +663,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
102 => Ok(PagestreamBeMessageTag::GetPage),
103 => Ok(PagestreamBeMessageTag::Error),
104 => Ok(PagestreamBeMessageTag::DbSize),
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
_ => Err(value),
}
}
@@ -724,14 +697,6 @@ pub struct PagestreamDbSizeRequest {
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
pub latest: bool,
pub lsn: Lsn,
pub kind: u8,
pub segno: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub exists: bool,
@@ -747,11 +712,6 @@ pub struct PagestreamGetPageResponse {
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamGetSlruSegmentResponse {
pub segment: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub message: String,
@@ -815,14 +775,6 @@ impl PagestreamFeMessage {
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
}
bytes.into()
@@ -873,14 +825,6 @@ impl PagestreamFeMessage {
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
)),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
}
}
@@ -916,12 +860,6 @@ impl PagestreamBeMessage {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
}
bytes.into()
@@ -962,14 +900,6 @@ impl PagestreamBeMessage {
let db_size = buf.read_i64::<BigEndian>()?;
Self::DbSize(PagestreamDbSizeResponse { db_size })
}
Tag::GetSlruSegment => {
let n_blocks = buf.read_u32::<BigEndian>()?;
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
buf.read_exact(&mut segment)?;
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
segment: segment.into(),
})
}
};
let remaining = buf.into_inner();
if !remaining.is_empty() {
@@ -988,7 +918,6 @@ impl PagestreamBeMessage {
Self::GetPage(_) => "GetPage",
Self::Error(_) => "Error",
Self::DbSize(_) => "DbSize",
Self::GetSlruSegment(_) => "GetSlruSegment",
}
}
}

View File

@@ -111,23 +111,9 @@ impl RelTag {
/// These files are divided into segments, which are divided into
/// pages of the same BLCKSZ as used for relation files.
///
#[derive(
Debug,
Clone,
Copy,
Hash,
Serialize,
Deserialize,
PartialEq,
Eq,
PartialOrd,
Ord,
strum_macros::EnumIter,
strum_macros::FromRepr,
)]
#[repr(u8)]
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum SlruKind {
Clog = 0,
Clog,
MultiXactMembers,
MultiXactOffsets,
}

View File

@@ -207,16 +207,10 @@ pub fn find_end_of_wal(
let seg_offs = curr_lsn.segment_offset(wal_seg_size);
segment.seek(SeekFrom::Start(seg_offs as u64))?;
// loop inside segment
while curr_lsn.segment_number(wal_seg_size) == segno {
loop {
let bytes_read = segment.read(&mut buf)?;
if bytes_read == 0 {
debug!(
"find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
result,
seg_file_path,
curr_lsn.segment_offset(wal_seg_size)
);
return Ok(result);
break; // EOF
}
curr_lsn += bytes_read as u64;
decoder.feed_bytes(&buf[0..bytes_read]);

View File

@@ -8,7 +8,6 @@ use std::pin::Pin;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::time::SystemTime;
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
@@ -24,7 +23,6 @@ use futures::stream::Stream;
use futures_util::StreamExt;
use http_types::{StatusCode, Url};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::s3_bucket::RequestKind;
@@ -185,6 +183,7 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
}
}
#[async_trait::async_trait]
impl RemoteStorage for AzureBlobStorage {
async fn list(
&self,
@@ -372,20 +371,6 @@ impl RemoteStorage for AzureBlobStorage {
copy_status = status;
}
}
async fn time_travel_recover(
&self,
_prefix: Option<&RemotePath>,
_timestamp: SystemTime,
_done_if_after: SystemTime,
_cancel: CancellationToken,
) -> anyhow::Result<()> {
// TODO use Azure point in time recovery feature for this
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
Err(anyhow::anyhow!(
"time travel recovery for azure blob storage is not implemented"
))
}
}
pin_project_lite::pin_project! {

View File

@@ -25,7 +25,6 @@ use bytes::Bytes;
use futures::stream::Stream;
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use toml_edit::Item;
use tracing::info;
@@ -143,7 +142,7 @@ pub struct Listing {
/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations for storage files.
#[allow(async_fn_in_trait)]
#[async_trait::async_trait]
pub trait RemoteStorage: Send + Sync + 'static {
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
@@ -211,15 +210,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// Copy a remote object inside a bucket from one path to another.
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
/// Resets the content of everything with the given prefix to the given state
async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()>;
}
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -272,15 +262,14 @@ impl std::error::Error for DownloadError {}
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
#[derive(Clone)]
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
pub enum GenericRemoteStorage {
LocalFs(LocalFs),
AwsS3(Arc<S3Bucket>),
AzureBlob(Arc<AzureBlobStorage>),
Unreliable(Other),
Unreliable(Arc<UnreliableWrapper>),
}
impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
impl GenericRemoteStorage {
pub async fn list(
&self,
prefix: Option<&RemotePath>,
@@ -397,33 +386,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
Self::Unreliable(s) => s.copy(from, to).await,
}
}
pub async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
Self::AwsS3(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
Self::AzureBlob(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
Self::Unreliable(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
}
}
}
impl GenericRemoteStorage {
@@ -434,12 +396,7 @@ impl GenericRemoteStorage {
Self::LocalFs(LocalFs::new(root.clone())?)
}
RemoteStorageKind::AwsS3(s3_config) => {
// The profile and access key id are only printed here for debugging purposes,
// their values don't indicate the eventually taken choice for auth.
let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "<none>".into());
let access_key_id =
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
}
@@ -716,7 +673,6 @@ impl ConcurrencyLimiter {
RequestKind::List => &self.read,
RequestKind::Delete => &self.write,
RequestKind::Copy => &self.write,
RequestKind::TimeTravel => &self.write,
}
}

View File

@@ -4,7 +4,7 @@
//! This storage used in tests, but can also be used in cases when a certain persistent
//! volume is mounted to the local FS.
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
use anyhow::{bail, ensure, Context};
use bytes::Bytes;
@@ -14,7 +14,7 @@ use tokio::{
fs,
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tokio_util::{io::ReaderStream, sync::CancellationToken};
use tokio_util::io::ReaderStream;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
@@ -157,6 +157,7 @@ impl LocalFs {
}
}
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list(
&self,
@@ -422,17 +423,6 @@ impl RemoteStorage for LocalFs {
})?;
Ok(())
}
#[allow(clippy::diverging_sub_expression)]
async fn time_travel_recover(
&self,
_prefix: Option<&RemotePath>,
_timestamp: SystemTime,
_done_if_after: SystemTime,
_cancel: CancellationToken,
) -> anyhow::Result<()> {
unimplemented!()
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {

View File

@@ -6,14 +6,12 @@
use std::{
borrow::Cow,
collections::HashMap,
pin::Pin,
sync::Arc,
task::{Context, Poll},
time::SystemTime,
};
use anyhow::{anyhow, Context as _};
use anyhow::Context as _;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
@@ -29,19 +27,17 @@ use aws_sdk_s3::{
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use aws_smithy_types::{body::SdkBody, DateTime};
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use super::StorageMetadata;
use crate::{
@@ -274,59 +270,6 @@ impl S3Bucket {
}
}
}
async fn delete_oids(
&self,
kind: RequestKind,
delete_objects: &[ObjectIdentifier],
) -> anyhow::Result<()> {
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
let started_at = start_measuring_requests(kind);
let resp = self
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(
Delete::builder()
.set_objects(Some(chunk.to_vec()))
.build()?,
)
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
let resp = resp?;
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
// Log a bounded number of the errors within the response:
// these requests can carry 1000 keys so logging each one
// would be too verbose, especially as errors may lead us
// to retry repeatedly.
const LOG_UP_TO_N_ERRORS: usize = 10;
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
tracing::warn!(
"DeleteObjects key {} failed: {}: {}",
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
e.message.as_ref().map(Cow::from).unwrap_or("".into())
);
}
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
}
}
Ok(())
}
}
pin_project_lite::pin_project! {
@@ -430,6 +373,7 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
}
}
#[async_trait::async_trait]
impl RemoteStorage for S3Bucket {
async fn list(
&self,
@@ -625,208 +569,64 @@ impl RemoteStorage for S3Bucket {
delete_objects.push(obj_id);
}
self.delete_oids(kind, &delete_objects).await
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
let started_at = start_measuring_requests(kind);
let resp = self
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(
Delete::builder()
.set_objects(Some(chunk.to_vec()))
.build()?,
)
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
match resp {
Ok(resp) => {
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
// Log a bounded number of the errors within the response:
// these requests can carry 1000 keys so logging each one
// would be too verbose, especially as errors may lead us
// to retry repeatedly.
const LOG_UP_TO_N_ERRORS: usize = 10;
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
tracing::warn!(
"DeleteObjects key {} failed: {}: {}",
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
e.message.as_ref().map(Cow::from).unwrap_or("".into())
);
}
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
}
}
Err(e) => {
return Err(e.into());
}
}
}
Ok(())
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let paths = std::array::from_ref(path);
self.delete_objects(paths).await
}
async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let kind = RequestKind::TimeTravel;
let _guard = self.permit(kind).await;
let timestamp = DateTime::from(timestamp);
let done_if_after = DateTime::from(done_if_after);
tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
// get the passed prefix or if it is not set use prefix_in_bucket value
let prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
let warn_threshold = 3;
let max_retries = 10;
let is_permanent = |_e: &_| false;
let mut key_marker = None;
let mut version_id_marker = None;
let mut versions_and_deletes = Vec::new();
loop {
let response = backoff::retry(
|| async {
Ok(self
.client
.list_object_versions()
.bucket(self.bucket_name.clone())
.set_prefix(prefix.clone())
.set_key_marker(key_marker.clone())
.set_version_id_marker(version_id_marker.clone())
.send()
.await?)
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions for time_travel_recover",
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
)
.await?;
tracing::trace!(
" Got List response version_id_marker={:?}, key_marker={:?}",
response.version_id_marker,
response.key_marker
);
let versions = response
.versions
.unwrap_or_default()
.into_iter()
.map(VerOrDelete::from_version);
let deletes = response
.delete_markers
.unwrap_or_default()
.into_iter()
.map(VerOrDelete::from_delete_marker);
itertools::process_results(versions.chain(deletes), |n_vds| {
versions_and_deletes.extend(n_vds)
})?;
fn none_if_empty(v: Option<String>) -> Option<String> {
v.filter(|v| !v.is_empty())
}
version_id_marker = none_if_empty(response.next_version_id_marker);
key_marker = none_if_empty(response.next_key_marker);
if version_id_marker.is_none() {
// The final response is not supposed to be truncated
if response.is_truncated.unwrap_or_default() {
anyhow::bail!(
"Received truncated ListObjectVersions response for prefix={prefix:?}"
);
}
break;
}
// Limit the number of versions deletions, mostly so that we don't
// keep requesting forever if the list is too long, as we'd put the
// list in RAM.
// Building a list of 100k entries that reaches the limit roughly takes
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
const COMPLEXITY_LIMIT: usize = 100_000;
if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
anyhow::bail!(
"Limit for number of versions/deletions exceeded for prefix={prefix:?}"
);
}
}
// Work on the list of references instead of the objects directly,
// otherwise we get lifetime errors in the sort_by_key call below.
let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
for vd in &versions_and_deletes {
let VerOrDelete {
version_id, key, ..
} = &vd;
if version_id == "null" {
anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
indicating either disabled versioning, or legacy objects with null version id values");
}
tracing::trace!(
"Parsing version key={key} version_id={version_id} kind={:?}",
vd.kind
);
vds_for_key.entry(key).or_default().push(vd);
}
for (key, versions) in vds_for_key {
let last_vd = versions.last().unwrap();
if last_vd.last_modified > done_if_after {
tracing::trace!("Key {key} has version later than done_if_after, skipping");
continue;
}
// the version we want to restore to.
let version_to_restore_to =
match versions.binary_search_by_key(&timestamp, |tpl| tpl.last_modified) {
Ok(v) => v,
Err(e) => e,
};
if version_to_restore_to == versions.len() {
tracing::trace!("Key {key} has no changes since timestamp, skipping");
continue;
}
let mut do_delete = false;
if version_to_restore_to == 0 {
// All versions more recent, so the key didn't exist at the specified time point.
tracing::trace!(
"All {} versions more recent for {key}, deleting",
versions.len()
);
do_delete = true;
} else {
match &versions[version_to_restore_to - 1] {
VerOrDelete {
kind: VerOrDeleteKind::Version,
version_id,
..
} => {
tracing::trace!("Copying old version {version_id} for {key}...");
// Restore the state to the last version by copying
let source_id =
format!("{}/{key}?versionId={version_id}", self.bucket_name);
backoff::retry(
|| async {
Ok(self
.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(key)
.copy_source(&source_id)
.send()
.await?)
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions for time_travel_recover",
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
)
.await?;
}
VerOrDelete {
kind: VerOrDeleteKind::DeleteMarker,
..
} => {
do_delete = true;
}
}
};
if do_delete {
if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
// Key has since been deleted (but there was some history), no need to do anything
tracing::trace!("Key {key} already deleted, skipping.");
} else {
tracing::trace!("Deleting {key}...");
let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
self.delete_oids(kind, &[oid]).await?;
}
}
}
Ok(())
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -851,62 +651,6 @@ fn start_measuring_requests(
})
}
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
struct VerOrDelete {
kind: VerOrDeleteKind,
last_modified: DateTime,
version_id: String,
key: String,
}
#[derive(Debug)]
enum VerOrDeleteKind {
Version,
DeleteMarker,
}
impl VerOrDelete {
fn with_kind(
kind: VerOrDeleteKind,
last_modified: Option<DateTime>,
version_id: Option<String>,
key: Option<String>,
) -> anyhow::Result<Self> {
let lvk = (last_modified, version_id, key);
let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
anyhow::bail!(
"One (or more) of last_modified, key, and id is None. \
Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
lvk.0,
lvk.1,
lvk.2,
);
};
Ok(Self {
kind,
last_modified,
version_id,
key,
})
}
fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
Self::with_kind(
VerOrDeleteKind::Version,
v.last_modified,
v.version_id,
v.key,
)
}
fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
Self::with_kind(
VerOrDeleteKind::DeleteMarker,
v.last_modified,
v.version_id,
v.key,
)
}
}
#[cfg(test)]
mod tests {
use camino::Utf8Path;

View File

@@ -12,7 +12,6 @@ pub(crate) enum RequestKind {
Delete = 2,
List = 3,
Copy = 4,
TimeTravel = 5,
}
use RequestKind::*;
@@ -25,7 +24,6 @@ impl RequestKind {
Delete => "delete_object",
List => "list_objects",
Copy => "copy_object",
TimeTravel => "time_travel_recover",
}
}
const fn as_index(&self) -> usize {
@@ -33,7 +31,7 @@ impl RequestKind {
}
}
pub(super) struct RequestTyped<C>([C; 6]);
pub(super) struct RequestTyped<C>([C; 5]);
impl<C> RequestTyped<C> {
pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -42,8 +40,8 @@ impl<C> RequestTyped<C> {
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
use RequestKind::*;
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
let arr = std::array::from_fn::<C, 6, _>(|index| {
let mut it = [Get, Put, Delete, List, Copy].into_iter();
let arr = std::array::from_fn::<C, 5, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(next)

View File

@@ -3,19 +3,16 @@
//! testing purposes.
use bytes::Bytes;
use futures::stream::Stream;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Mutex;
use std::time::SystemTime;
use std::{collections::hash_map::Entry, sync::Arc};
use tokio_util::sync::CancellationToken;
use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
StorageMetadata,
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
};
pub struct UnreliableWrapper {
inner: GenericRemoteStorage<Arc<VoidStorage>>,
inner: crate::GenericRemoteStorage,
// This many attempts of each operation will fail, then we let it succeed.
attempts_to_fail: u64,
@@ -32,21 +29,11 @@ enum RemoteOp {
Download(RemotePath),
Delete(RemotePath),
DeleteObjects(Vec<RemotePath>),
TimeTravelRecover(Option<RemotePath>),
}
impl UnreliableWrapper {
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
assert!(attempts_to_fail > 0);
let inner = match inner {
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
// We could also make this a no-op, as in, extract the inner of the passed generic remote storage
GenericRemoteStorage::Unreliable(_s) => {
panic!("Can't wrap unreliable wrapper unreliably")
}
};
UnreliableWrapper {
inner,
attempts_to_fail,
@@ -97,9 +84,7 @@ impl UnreliableWrapper {
}
}
// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
type VoidStorage = crate::LocalFs;
#[async_trait::async_trait]
impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(
&self,
@@ -184,17 +169,4 @@ impl RemoteStorage for UnreliableWrapper {
self.attempt(RemoteOp::Upload(to.clone()))?;
self.inner.copy_object(from, to).await
}
async fn time_travel_recover(
&self,
prefix: Option<&RemotePath>,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
self.inner
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
}

View File

@@ -1,21 +1,15 @@
use std::collections::HashSet;
use std::env;
use std::fmt::{Debug, Display};
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::sync::Arc;
use std::time::{Duration, UNIX_EPOCH};
use std::{collections::HashSet, time::SystemTime};
use std::time::UNIX_EPOCH;
use crate::common::{download_to_vec, upload_stream};
use anyhow::Context;
use camino::Utf8Path;
use futures_util::Future;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::test_context;
use test_context::AsyncTestContext;
use tokio_util::sync::CancellationToken;
use tracing::info;
mod common;
@@ -24,160 +18,11 @@ mod common;
mod tests_s3;
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
use utils::backoff;
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorage::Enabled(ctx) => ctx,
MaybeEnabledStorage::Disabled => return Ok(()),
};
// Our test depends on discrepancies in the clock between S3 and the environment the tests
// run in. Therefore, wait a little bit before and after. The alternative would be
// to take the time from S3 response headers.
const WAIT_TIME: Duration = Duration::from_millis(3_000);
async fn retry<T, O, F, E>(op: O) -> Result<T, E>
where
E: Display + Debug + 'static,
O: FnMut() -> F,
F: Future<Output = Result<T, E>>,
{
let warn_threshold = 3;
let max_retries = 10;
backoff::retry(
op,
|_e| false,
warn_threshold,
max_retries,
"test retry",
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
}
async fn time_point() -> SystemTime {
tokio::time::sleep(WAIT_TIME).await;
let ret = SystemTime::now();
tokio::time::sleep(WAIT_TIME).await;
ret
}
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
Ok(retry(|| client.list_files(None))
.await
.context("list root files failure")?
.into_iter()
.collect::<HashSet<_>>())
}
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
retry(|| {
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None)
})
.await?;
let t0_files = list_files(&ctx.client).await?;
let t0 = time_point().await;
println!("at t0: {t0_files:?}");
let old_data = "remote blob data2";
retry(|| {
let (data, len) = upload_stream(old_data.as_bytes().into());
ctx.client.upload(data, len, &path2, None)
})
.await?;
let t1_files = list_files(&ctx.client).await?;
let t1 = time_point().await;
println!("at t1: {t1_files:?}");
// A little check to ensure that our clock is not too far off from the S3 clock
{
let dl = retry(|| ctx.client.download(&path2)).await?;
let last_modified = dl.last_modified.unwrap();
let half_wt = WAIT_TIME.mul_f32(0.5);
let t0_hwt = t0 + half_wt;
let t1_hwt = t1 - half_wt;
if !(t0_hwt..=t1_hwt).contains(&last_modified) {
panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
This likely means a large lock discrepancy between S3 and the local clock.");
}
}
retry(|| {
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None)
})
.await?;
let new_data = "new remote blob data2";
retry(|| {
let (data, len) = upload_stream(new_data.as_bytes().into());
ctx.client.upload(data, len, &path2, None)
})
.await?;
retry(|| ctx.client.delete(&path1)).await?;
let t2_files = list_files(&ctx.client).await?;
let t2 = time_point().await;
println!("at t2: {t2_files:?}");
// No changes after recovery to t2 (no-op)
let t_final = time_point().await;
ctx.client
.time_travel_recover(None, t2, t_final, CancellationToken::new())
.await?;
let t2_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t2: {t2_files_recovered:?}");
assert_eq!(t2_files, t2_files_recovered);
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
assert_eq!(path2_recovered_t2, new_data.as_bytes());
// after recovery to t1: path1 is back, path2 has the old content
let t_final = time_point().await;
ctx.client
.time_travel_recover(None, t1, t_final, CancellationToken::new())
.await?;
let t1_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t1: {t1_files_recovered:?}");
assert_eq!(t1_files, t1_files_recovered);
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
assert_eq!(path2_recovered_t1, old_data.as_bytes());
// after recovery to t0: everything is gone except for path1
let t_final = time_point().await;
ctx.client
.time_travel_recover(None, t0, t_final, CancellationToken::new())
.await?;
let t0_files_recovered = list_files(&ctx.client).await?;
println!("after recovery to t0: {t0_files_recovered:?}");
assert_eq!(t0_files, t0_files_recovered);
// cleanup
let paths = &[path1, path2, path3];
retry(|| ctx.client.delete_objects(paths)).await?;
Ok(())
}
struct EnabledS3 {
client: Arc<GenericRemoteStorage>,
base_prefix: &'static str,

View File

@@ -112,55 +112,6 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
tokio::fs::File::open(path.as_ref()).await?.sync_all().await
}
pub async fn fsync_async_opt(
path: impl AsRef<Utf8Path>,
do_fsync: bool,
) -> Result<(), std::io::Error> {
if do_fsync {
fsync_async(path.as_ref()).await?;
}
Ok(())
}
/// Like postgres' durable_rename, renames file issuing fsyncs do make it
/// durable. After return, file and rename are guaranteed to be persisted.
///
/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
/// contents durable; 2) its directory entry to make rename durable 3) again to
/// already renamed file, which is not required by standards but postgres does
/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
/// rename if it exists to ensure that at least one of the files survives, but
/// current callers don't need that.
///
/// virtual_file.rs has similar code, but it doesn't use vfs.
///
/// Useful links: <https://lwn.net/Articles/457667/>
/// <https://www.postgresql.org/message-id/flat/56583BDD.9060302%402ndquadrant.com>
/// <https://thunk.org/tytso/blog/2009/03/15/dont-fear-the-fsync/>
pub async fn durable_rename(
old_path: impl AsRef<Utf8Path>,
new_path: impl AsRef<Utf8Path>,
do_fsync: bool,
) -> io::Result<()> {
// first fsync the file
fsync_async_opt(old_path.as_ref(), do_fsync).await?;
// Time to do the real deal.
tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
// Postgres'ish fsync of renamed file.
fsync_async_opt(new_path.as_ref(), do_fsync).await?;
// Now fsync the parent
let parent = match new_path.as_ref().parent() {
Some(p) => p,
None => Utf8Path::new("./"), // assume current dir if there is no parent
};
fsync_async_opt(parent, do_fsync).await?;
Ok(())
}
#[cfg(test)]
mod tests {

View File

@@ -131,9 +131,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
_ => info!("Error processing HTTP request: {api_error:#}"),
_ => error!("Error processing HTTP request: {api_error:#}"),
}
api_error.into_response()

View File

@@ -5,10 +5,10 @@ use std::os::unix::io::RawFd;
pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
let bits = fcntl(fd, F_GETFL)?;
// If F_GETFL returns some unknown bits, they should be valid
// Safety: If F_GETFL returns some unknown bits, they should be valid
// for passing back to F_SETFL, too. If we left them out, the F_SETFL
// would effectively clear them, which is not what we want.
let mut flags = OFlag::from_bits_retain(bits);
let mut flags = unsafe { OFlag::from_bits_unchecked(bits) };
flags |= OFlag::O_NONBLOCK;
fcntl(fd, F_SETFL(flags))?;

View File

@@ -1,10 +1,4 @@
use std::{
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
time::Duration,
};
use std::{sync::Arc, time::Duration};
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
///
@@ -12,70 +6,62 @@ use std::{
/// the resource calls `close()` when they want to ensure that all holders of guards
/// have released them, and that no future guards will be issued.
pub struct Gate {
inner: Arc<GateInner>,
/// Each caller of enter() takes one unit from the semaphore. In close(), we
/// take all the units to ensure all GateGuards are destroyed.
sem: Arc<tokio::sync::Semaphore>,
/// For observability only: a name that will be used to log warnings if a particular
/// gate is holding up shutdown
name: String,
}
impl std::fmt::Debug for Gate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Gate")
// use this for identification
.field("ptr", &Arc::as_ptr(&self.inner))
.field("inner", &self.inner)
.finish()
}
}
struct GateInner {
sem: tokio::sync::Semaphore,
closing: std::sync::atomic::AtomicBool,
}
impl std::fmt::Debug for GateInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let avail = self.sem.available_permits();
let guards = u32::try_from(avail)
.ok()
// the sem only supports 32-bit ish amount, but lets play it safe
.and_then(|x| Gate::MAX_UNITS.checked_sub(x));
let closing = self.closing.load(Ordering::Relaxed);
if let Some(guards) = guards {
f.debug_struct("Gate")
.field("remaining_guards", &guards)
.field("closing", &closing)
.finish()
} else {
f.debug_struct("Gate")
.field("avail_permits", &avail)
.field("closing", &closing)
.finish()
}
write!(f, "Gate<{}>", self.name)
}
}
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
/// not complete.
#[derive(Debug)]
pub struct GateGuard {
// Record the span where the gate was entered, so that we can identify who was blocking Gate::close
span_at_enter: tracing::Span,
gate: Arc<GateInner>,
}
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
impl Drop for GateGuard {
fn drop(&mut self) {
if self.gate.closing.load(Ordering::Relaxed) {
self.span_at_enter.in_scope(
|| tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"),
);
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
async fn warn_if_stuck<Fut: std::future::Future>(
fut: Fut,
name: &str,
warn_period: std::time::Duration,
) -> <Fut as std::future::Future>::Output {
let started = std::time::Instant::now();
let mut fut = std::pin::pin!(fut);
let mut warned = false;
let ret = loop {
match tokio::time::timeout(warn_period, &mut fut).await {
Ok(ret) => break ret,
Err(_) => {
tracing::warn!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
warned = true;
}
}
};
// when the permit was acquired, it was forgotten to allow us to manage it's lifecycle
// manually, so "return" the permit now.
self.gate.sem.add_permits(1);
// If we emitted a warning for slowness, also emit a message when we complete, so that
// someone debugging a shutdown can know for sure whether we have moved past this operation.
if warned {
tracing::info!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"completed, after taking longer than expected"
)
}
ret
}
#[derive(Debug)]
@@ -83,20 +69,16 @@ pub enum GateError {
GateClosed,
}
impl Default for Gate {
fn default() -> Self {
Self {
inner: Arc::new(GateInner {
sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize),
closing: AtomicBool::new(false),
}),
}
}
}
impl Gate {
const MAX_UNITS: u32 = u32::MAX;
pub fn new(name: String) -> Self {
Self {
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
name,
}
}
/// Acquire a guard that will prevent close() calls from completing. If close()
/// was already called, this will return an error which should be interpreted
/// as "shutting down".
@@ -106,23 +88,11 @@ impl Gate {
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
/// also contain a CancellationToken.
pub fn enter(&self) -> Result<GateGuard, GateError> {
let permit = self
.inner
.sem
.try_acquire()
.map_err(|_| GateError::GateClosed)?;
// we now have the permit, let's disable the normal raii functionality and leave
// "returning" the permit to our GateGuard::drop.
//
// this is done to avoid the need for multiple Arcs (one for semaphore, next for other
// fields).
permit.forget();
Ok(GateGuard {
span_at_enter: tracing::Span::current(),
gate: self.inner.clone(),
})
self.sem
.clone()
.try_acquire_owned()
.map(GateGuard)
.map_err(|_| GateError::GateClosed)
}
/// Types with a shutdown() method and a gate should call this method at the
@@ -132,88 +102,48 @@ impl Gate {
/// important that the holders of such guards are respecting a CancellationToken which has
/// been cancelled before entering this function.
pub async fn close(&self) {
let started_at = std::time::Instant::now();
let mut do_close = std::pin::pin!(self.do_close());
let nag_after = Duration::from_secs(1);
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
return;
};
tracing::info!(
gate = ?self.as_ptr(),
elapsed_ms = started_at.elapsed().as_millis(),
"closing is taking longer than expected"
);
// close operation is not trying to be cancellation safe as pageserver does not need it.
//
// note: "closing" is not checked in Gate::enter -- it exists just for observability,
// dropping of GateGuard after this will log who they were.
self.inner.closing.store(true, Ordering::Relaxed);
do_close.await;
tracing::info!(
gate = ?self.as_ptr(),
elapsed_ms = started_at.elapsed().as_millis(),
"close completed"
);
}
/// Used as an identity of a gate. This identity will be resolved to something useful when
/// it's actually closed in a hopefully sensible `tracing::Span` which will describe it even
/// more.
///
/// `GateGuard::drop` also logs this pointer when it has realized it has been keeping the gate
/// open for too long.
fn as_ptr(&self) -> *const GateInner {
Arc::as_ptr(&self.inner)
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
}
/// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
/// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
/// the CancellationToken on such types is analogous to "Did shutdown start?"
pub fn close_complete(&self) -> bool {
self.inner.sem.is_closed()
self.sem.is_closed()
}
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))]
async fn do_close(&self) {
tracing::debug!("Closing Gate...");
match self.inner.sem.acquire_many(Self::MAX_UNITS).await {
Ok(_permit) => {
tracing::debug!(gate = self.name, "Closing Gate...");
match self.sem.acquire_many(Self::MAX_UNITS).await {
Ok(_units) => {
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
self.inner.sem.close();
self.sem.close();
}
Err(_closed) => {
Err(_) => {
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
// This is legal. Timeline::shutdown for example is not protected from being called more than
// once.
tracing::debug!("Double close")
tracing::debug!(gate = self.name, "Double close")
}
}
tracing::debug!("Closed Gate.")
tracing::debug!(gate = self.name, "Closed Gate.")
}
}
#[cfg(test)]
mod tests {
use futures::FutureExt;
use super::*;
#[tokio::test]
async fn close_unused() {
// Having taken no guards, we should not be blocked in close
let gate = Gate::default();
async fn test_idle_gate() {
// Having taken no gates, we should not be blocked in close
let gate = Gate::new("test".to_string());
gate.close().await;
}
#[tokio::test]
async fn close_idle() {
// If a guard is dropped before entering, close should not be blocked
let gate = Gate::default();
let gate = Gate::new("test".to_string());
let guard = gate.enter().unwrap();
drop(guard);
gate.close().await;
@@ -222,30 +152,25 @@ mod tests {
gate.enter().expect_err("enter should fail after close");
}
#[tokio::test(start_paused = true)]
async fn close_busy_gate() {
let gate = Gate::default();
let forever = Duration::from_secs(24 * 7 * 365);
#[tokio::test]
async fn test_busy_gate() {
let gate = Gate::new("test".to_string());
let guard =
tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap());
let guard = gate.enter().unwrap();
let mut close_fut = std::pin::pin!(gate.close());
// Close should be waiting for guards to drop
tokio::time::timeout(forever, &mut close_fut)
.await
.unwrap_err();
// Close should be blocked
assert!(close_fut.as_mut().now_or_never().is_none());
// Attempting to enter() should fail, even though close isn't done yet.
gate.enter()
.expect_err("enter should fail after entering close");
// this will now log, which we cannot verify except manually
drop(guard);
// Guard is gone, close should finish
close_fut.await;
assert!(close_fut.as_mut().now_or_never().is_some());
// Attempting to enter() is still forbidden
gate.enter().expect_err("enter should fail finishing close");

View File

@@ -1,6 +1,7 @@
use std::{
io,
net::{TcpListener, ToSocketAddrs},
os::unix::prelude::AsRawFd,
};
use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
@@ -9,7 +10,7 @@ use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
pub fn bind<A: ToSocketAddrs>(addr: A) -> io::Result<TcpListener> {
let listener = TcpListener::bind(addr)?;
setsockopt(&listener, ReuseAddr, &true)?;
setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?;
Ok(listener)
}

View File

@@ -21,6 +21,7 @@ camino.workspace = true
camino-tempfile.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
close_fds.workspace = true
const_format.workspace = true
consumption_metrics.workspace = true
crc32c.workspace = true
@@ -60,7 +61,6 @@ sync_wrapper.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
tokio-stream.workspace = true

View File

@@ -69,25 +69,6 @@ impl Client {
resp.json().await.map_err(Error::ReceiveBody)
}
/// Get an arbitrary path and returning a streaming Response. This function is suitable
/// for pass-through/proxy use cases where we don't care what the response content looks
/// like.
///
/// Use/add one of the properly typed methods below if you know aren't proxying, and
/// know what kind of response you expect.
pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
debug_assert!(path.starts_with('/'));
let uri = format!("{}{}", self.mgmt_api_endpoint, path);
let req = self.client.request(Method::GET, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
req.send().await.map_err(Error::ReceiveBody)
}
pub async fn tenant_details(
&self,
tenant_shard_id: TenantShardId,
@@ -190,25 +171,6 @@ impl Client {
.map_err(Error::ReceiveBody)
}
/// The tenant deletion API can return 202 if deletion is incomplete, or
/// 404 if it is complete. Callers are responsible for checking the status
/// code and retrying. Error codes other than 404 will return Err().
pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
match self.request(Method::DELETE, &uri, ()).await {
Err(Error::ApiError(status_code, msg)) => {
if status_code == StatusCode::NOT_FOUND {
Ok(StatusCode::NOT_FOUND)
} else {
Err(Error::ApiError(status_code, msg))
}
}
Err(e) => Err(e),
Ok(response) => Ok(response.status()),
}
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
@@ -272,32 +234,6 @@ impl Client {
.map_err(Error::ReceiveBody)
}
/// The timeline deletion API can return 201 if deletion is incomplete, or
/// 403 if it is complete. Callers are responsible for checking the status
/// code and retrying. Error codes other than 403 will return Err().
pub async fn timeline_delete(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<StatusCode> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
match self.request(Method::DELETE, &uri, ()).await {
Err(Error::ApiError(status_code, msg)) => {
if status_code == StatusCode::NOT_FOUND {
Ok(StatusCode::NOT_FOUND)
} else {
Err(Error::ApiError(status_code, msg))
}
}
Err(e) => Err(e),
Ok(response) => Ok(response.status()),
}
}
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",

View File

@@ -156,8 +156,7 @@ impl PagestreamClient {
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
PagestreamBeMessage::Exists(_)
| PagestreamBeMessage::Nblocks(_)
| PagestreamBeMessage::DbSize(_)
| PagestreamBeMessage::GetSlruSegment(_) => {
| PagestreamBeMessage::DbSize(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
msg.kind()

View File

@@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
use pageserver::tenant::storage_layer::range_overlaps;
use pageserver::virtual_file::{self, VirtualFile};
use pageserver::virtual_file::VirtualFile;
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
pageserver::virtual_file::init(10);
pageserver::page_cache::init(100);
let mut total_delta_layers = 0usize;

View File

@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
new_tenant_id,
new_timeline_id,
} => {
pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
pageserver::virtual_file::init(10);
pageserver::page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

View File

@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
virtual_file::init(10);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx).await

View File

@@ -423,8 +423,8 @@ async fn client(
tokio::select! {
res = do_requests => { res },
_ = cancel.cancelled() => {
// fallthrough to shutdown
client.shutdown().await;
return;
}
}
client.shutdown().await;
}

View File

@@ -66,10 +66,13 @@ impl serde::Serialize for LatencyPercentiles {
{
use serde::ser::SerializeMap;
let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
for p in LATENCY_PERCENTILES {
ser.serialize_entry(
&format!("p{p}"),
&format!("{}", humantime::format_duration(*v)),
&format!(
"{}",
&humantime::format_duration(self.latency_percentiles[0])
),
)?;
}
ser.end()

View File

@@ -11,9 +11,8 @@
//! from data stored in object storage.
//!
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, Bytes, BytesMut};
use bytes::{BufMut, BytesMut};
use fail::fail_point;
use pageserver_api::key::{key_to_slru_block, Key};
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::SystemTime;
@@ -134,87 +133,6 @@ where
ctx: &'a RequestContext,
}
/// A sink that accepts SLRU blocks ordered by key and forwards
/// full segments to the archive.
struct SlruSegmentsBuilder<'a, 'b, W>
where
W: AsyncWrite + Send + Sync + Unpin,
{
ar: &'a mut Builder<&'b mut W>,
buf: Vec<u8>,
current_segment: Option<(SlruKind, u32)>,
}
impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
where
W: AsyncWrite + Send + Sync + Unpin,
{
fn new(ar: &'a mut Builder<&'b mut W>) -> Self {
Self {
ar,
buf: Vec::new(),
current_segment: None,
}
}
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
let (kind, segno, _) = key_to_slru_block(*key)?;
match kind {
SlruKind::Clog => {
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
}
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
ensure!(block.len() == BLCKSZ as usize);
}
}
let segment = (kind, segno);
match self.current_segment {
None => {
self.current_segment = Some(segment);
self.buf
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
}
Some(current_seg) if current_seg == segment => {
self.buf
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
}
Some(_) => {
self.flush().await?;
self.current_segment = Some(segment);
self.buf
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
}
}
Ok(())
}
async fn flush(&mut self) -> anyhow::Result<()> {
let nblocks = self.buf.len() / BLCKSZ as usize;
let (kind, segno) = self.current_segment.take().unwrap();
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
let header = new_tar_header(&segname, self.buf.len() as u64)?;
self.ar.append(&header, self.buf.as_slice()).await?;
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
self.buf.clear();
Ok(())
}
async fn finish(mut self) -> anyhow::Result<()> {
if self.current_segment.is_none() || self.buf.is_empty() {
return Ok(());
}
self.flush().await
}
}
impl<'a, W> Basebackup<'a, W>
where
W: AsyncWrite + Send + Sync + Unpin,
@@ -222,8 +140,6 @@ where
async fn send_tarball(mut self) -> anyhow::Result<()> {
// TODO include checksum
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
// Create pgdata subdirs structure
for dir in PGDATA_SUBDIRS.iter() {
let header = new_tar_header_dir(dir)?;
@@ -250,27 +166,20 @@ where
.context("could not add config file to basebackup tarball")?;
}
}
if !lazy_slru_download {
// Gather non-relational files from object storage pages.
let slru_partitions = self
// Gather non-relational files from object storage pages.
for kind in [
SlruKind::Clog,
SlruKind::MultiXactOffsets,
SlruKind::MultiXactMembers,
] {
for segno in self
.timeline
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
.await?
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let blocks = self
.timeline
.get_vectored(&part.ranges, self.lsn, self.ctx)
.await?;
for (key, block) in blocks {
slru_builder.add_block(&key, block?).await?;
}
{
self.add_slru_segment(kind, segno).await?;
}
slru_builder.finish().await?;
}
let mut min_restart_lsn: Lsn = Lsn::MAX;
@@ -396,6 +305,39 @@ where
Ok(())
}
//
// Generate SLRU segment files from repository.
//
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
.await?;
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
for blknum in 0..nblocks {
let img = self
.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
.await?;
if slru == SlruKind::Clog {
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
} else {
ensure!(img.len() == BLCKSZ as usize);
}
slru_buf.extend_from_slice(&img[..BLCKSZ as usize]);
}
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
let header = new_tar_header(&segname, slru_buf.len() as u64)?;
self.ar.append(&header, slru_buf.as_slice()).await?;
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
Ok(())
}
//
// Include database/tablespace directories.
//

View File

@@ -33,10 +33,12 @@ use pageserver::{
use postgres_backend::AuthType;
use utils::failpoint_support;
use utils::logging::TracingErrorLayerEnablement;
use utils::signals::ShutdownSignals;
use utils::{
auth::{JwtAuth, SwappableJwtAuth},
logging, project_build_tag, project_git_version,
sentry_init::init_sentry,
signals::Signal,
tcp_listener,
};
@@ -128,7 +130,7 @@ fn main() -> anyhow::Result<()> {
let scenario = failpoint_support::init();
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf.page_cache_size);
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
@@ -654,42 +656,34 @@ fn start_pageserver(
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
{
use signal_hook::consts::*;
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
let mut signals =
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
return signals
.forever()
.next()
.expect("forever() never returns None unless explicitly closed");
});
let signal = BACKGROUND_RUNTIME
.block_on(signal_handler)
.expect("join error");
match signal {
SIGQUIT => {
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
SIGINT | SIGTERM => {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
}
_ => unreachable!(),
ShutdownSignals::handle(|signal| match signal {
Signal::Quit => {
info!(
"Got {}. Terminating in immediate shutdown mode",
signal.name()
);
std::process::exit(111);
}
}
Signal::Interrupt | Signal::Terminate => {
info!(
"Got {}. Terminating gracefully in fast shutdown mode",
signal.name()
);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
}
})
}
fn create_remote_storage_client(

View File

@@ -36,7 +36,6 @@ use crate::tenant::config::TenantConfOpt;
use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
use crate::virtual_file;
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
@@ -44,8 +43,6 @@ use crate::{
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
pub mod defaults {
use crate::tenant::config::defaults::*;
use const_format::formatcp;
@@ -82,8 +79,6 @@ pub mod defaults {
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
///
/// Default built-in configuration file.
///
@@ -119,8 +114,6 @@ pub mod defaults {
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -254,8 +247,6 @@ pub struct PageServerConf {
/// Maximum number of WAL records to be ingested and committed at the same time
pub ingest_batch_size: u64,
pub virtual_file_io_engine: virtual_file::IoEngineKind,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -340,8 +331,6 @@ struct PageServerConfigBuilder {
secondary_download_concurrency: BuilderValue<usize>,
ingest_batch_size: BuilderValue<u64>,
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
}
impl Default for PageServerConfigBuilder {
@@ -417,8 +406,6 @@ impl Default for PageServerConfigBuilder {
secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
}
}
}
@@ -575,10 +562,6 @@ impl PageServerConfigBuilder {
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
}
pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
self.virtual_file_io_engine = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
@@ -686,9 +669,6 @@ impl PageServerConfigBuilder {
ingest_batch_size: self
.ingest_batch_size
.ok_or(anyhow!("missing ingest_batch_size"))?,
virtual_file_io_engine: self
.virtual_file_io_engine
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
})
}
}
@@ -940,9 +920,6 @@ impl PageServerConf {
builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
},
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
"virtual_file_io_engine" => {
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1016,7 +993,6 @@ impl PageServerConf {
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
}
}
}
@@ -1249,7 +1225,6 @@ background_task_maximum_delay = '334 s'
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1313,7 +1288,6 @@ background_task_maximum_delay = '334 s'
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: 100,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -97,86 +97,23 @@ pub enum EvictionOrder {
/// Order the layers to be evicted by how recently they have been accessed relatively within
/// the set of resident layers of a tenant.
///
/// This strategy will evict layers more fairly but is untested.
RelativeAccessed {
/// Determines if the tenant with most layers should lose first.
///
/// Having this enabled is currently the only reasonable option, because the order in which
/// we read tenants is deterministic. If we find the need to use this as `false`, we need
/// to ensure nondeterminism by adding in a random number to break the
/// `relative_last_activity==0.0` ties.
#[serde(default = "default_highest_layer_count_loses_first")]
#[serde(default)]
highest_layer_count_loses_first: bool,
},
}
fn default_highest_layer_count_loses_first() -> bool {
true
}
impl EvictionOrder {
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
use EvictionOrder::*;
/// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
/// counts should be the first ones to have their layers evicted.
fn highest_layer_count_loses_first(&self) -> bool {
match self {
AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
}),
}
}
/// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
/// layers in **most** recently used order.
fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
use EvictionOrder::*;
match self {
AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
RelativeAccessed {
EvictionOrder::AbsoluteAccessed => false,
EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first,
} => {
// keeping the -1 or not decides if every tenant should lose their least recently accessed
// layer OR if this should happen in the order of having highest layer count:
let fudge = if *highest_layer_count_loses_first {
// relative_last_activity vs. tenant layer count:
// - 0.1..=1.0 (10 layers)
// - 0.01..=1.0 (100 layers)
// - 0.001..=1.0 (1000 layers)
//
// leading to evicting less of the smallest tenants.
0
} else {
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
// be that less than 10k layer evictions is enough, so we would not need to evict from
// all tenants.
//
// as the tenant ordering is now deterministic this could hit the same tenants
// disproportionetly on multiple invocations. alternative could be to remember how many
// layers did we evict last time from this tenant, and inject that as an additional
// fudge here.
1
};
let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
let divider = total as f32;
// most recently used is always (total - 0) / divider == 1.0
// least recently used depends on the fudge:
// - (total - 1) - (total - 1) / total => 0 / total
// - total - (total - 1) / total => 1 / total
let distance = (total - index) as f32;
finite_f32::FiniteF32::try_from_normalized(distance / divider)
.unwrap_or_else(|val| {
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
finite_f32::FiniteF32::ZERO
})
}
} => *highest_layer_count_loses_first,
}
}
}
@@ -452,6 +389,52 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
let selection = select_victims(&candidates, usage_pre);
let mut candidates = candidates;
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
// for comparison here. this is a temporary measure to develop alternatives.
use std::fmt::Write;
let mut summary_buf = String::with_capacity(256);
{
let absolute_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{absolute_summary}").expect("string grows");
info!("absolute accessed selection summary: {summary_buf}");
}
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
let selection = select_victims(&candidates, usage_pre);
{
summary_buf.clear();
let relative_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{relative_summary}").expect("string grows");
info!("relative accessed selection summary: {summary_buf}");
}
selection
} else {
selection
};
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
// phase2: evict layers
@@ -852,12 +835,54 @@ async fn collect_eviction_candidates(
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
let total = tenant_candidates.len();
// keeping the -1 or not decides if every tenant should lose their least recently accessed
// layer OR if this should happen in the order of having highest layer count:
let fudge = if eviction_order.highest_layer_count_loses_first() {
// relative_age vs. tenant layer count:
// - 0.1..=1.0 (10 layers)
// - 0.01..=1.0 (100 layers)
// - 0.001..=1.0 (1000 layers)
//
// leading to evicting less of the smallest tenants.
0
} else {
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
// be that less than 10k layer evictions is enough, so we would not need to evict from
// all tenants.
//
// as the tenant ordering is now deterministic this could hit the same tenants
// disproportionetly on multiple invocations. alternative could be to remember how many
// layers did we evict last time from this tenant, and inject that as an additional
// fudge here.
1
};
let total = tenant_candidates
.len()
.checked_sub(fudge)
.filter(|&x| x > 0)
// support 0 or 1 resident layer tenants as well
.unwrap_or(1);
let divider = total as f32;
for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
// as we iterate this reverse sorted list, the most recently accessed layer will always
// be 1.0; this is for us to evict it last.
candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
candidate.relative_last_activity = if matches!(
eviction_order,
EvictionOrder::RelativeAccessed { .. }
) {
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
// similarly for u16. unsure how it would help.
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
.unwrap_or_else(|val| {
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
finite_f32::FiniteF32::ZERO
})
} else {
finite_f32::FiniteF32::ZERO
};
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
@@ -902,7 +927,10 @@ async fn collect_eviction_candidates(
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
eviction_order.sort(&mut candidates);
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
// will sort later by candidate.relative_last_activity to get compare evictions.
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
Ok(EvictionCandidates::Finished(candidates))
}
@@ -1042,12 +1070,6 @@ pub(crate) mod finite_f32 {
}
}
impl From<FiniteF32> for f32 {
fn from(value: FiniteF32) -> f32 {
value.0
}
}
impl FiniteF32 {
pub const ZERO: FiniteF32 = FiniteF32(0.0);
@@ -1060,9 +1082,136 @@ pub(crate) mod finite_f32 {
Err(value)
}
}
}
}
pub fn into_inner(self) -> f32 {
self.into()
mod summary {
use super::finite_f32::FiniteF32;
use super::{EvictionCandidate, LayerCount};
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use std::time::SystemTime;
#[derive(Debug, Default)]
pub(super) struct EvictionSummary {
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
total: LayerCount,
last_absolute: Option<SystemTime>,
last_relative: Option<FiniteF32>,
}
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
let mut summary = EvictionSummary::default();
for item in iter {
let counts = summary
.evicted_per_tenant
.entry(*item.layer.get_tenant_shard_id())
.or_default();
let sz = item.layer.get_file_size();
counts.file_sizes += sz;
counts.count += 1;
summary.total.file_sizes += sz;
summary.total.count += 1;
summary.last_absolute = Some(item.last_activity_ts);
summary.last_relative = Some(item.relative_last_activity);
}
summary
}
}
struct SiBytesAmount(u64);
impl std::fmt::Display for SiBytesAmount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0 < 1024 {
return write!(f, "{}B", self.0);
}
let mut tmp = self.0;
let mut ch = 0;
let suffixes = b"KMGTPE";
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
tmp /= 1024;
ch += 1;
}
let ch = suffixes[ch] as char;
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
}
}
impl std::fmt::Display for EvictionSummary {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// wasteful, but it's for testing
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
for (tenant_shard_id, count) in &self.evicted_per_tenant {
sorted
.entry(count.count)
.or_default()
.push((*tenant_shard_id, count.file_sizes));
}
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
writeln!(
f,
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
self.total.count, self.last_absolute, self.last_relative,
)?;
for (count, per_tenant) in sorted.iter().rev().take(10) {
write!(f, "- {count} layers: ")?;
if per_tenant.len() < 3 {
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
let bytes = SiBytesAmount(*bytes);
write!(f, "{tenant_shard_id} ({bytes})")?;
}
} else {
let num_tenants = per_tenant.len();
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
let total_bytes = SiBytesAmount(total_bytes);
let layers = num_tenants * count;
write!(
f,
"{num_tenants} tenants {total_bytes} in total {layers} layers",
)?;
}
writeln!(f)?;
}
if sorted.len() > 10 {
let (rem_count, rem_bytes) = sorted
.iter()
.rev()
.map(|(count, per_tenant)| {
(
count,
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
)
})
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
let rem_bytes = SiBytesAmount(rem_bytes);
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
}
Ok(())
}
}
}
@@ -1187,40 +1336,3 @@ mod filesystem_level_usage {
assert!(!usage.has_pressure());
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn relative_equal_bounds() {
let order = EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first: false,
};
let len = 10;
let v = (0..len)
.map(|i| order.relative_last_activity(len, i).into_inner())
.collect::<Vec<_>>();
assert_eq!(v.first(), Some(&1.0));
assert_eq!(v.last(), Some(&0.0));
assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
}
#[test]
fn relative_spare_bounds() {
let order = EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first: true,
};
let len = 10;
let v = (0..len)
.map(|i| order.relative_last_activity(len, i).into_inner())
.collect::<Vec<_>>();
assert_eq!(v.first(), Some(&1.0));
assert_eq!(v.last(), Some(&0.1));
assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
}
}

View File

@@ -419,6 +419,12 @@ paths:
type: string
format: date-time
description: A timestamp to get the LSN
- name: version
in: query
required: false
schema:
type: integer
description: The version of the endpoint to use
responses:
"200":
description: OK
@@ -668,10 +674,6 @@ paths:
responses:
"200":
description: Tenant is now in requested state
content:
application/json:
schema:
$ref: "#/components/schemas/TenantLocationConfigResponse"
"503":
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
content:
@@ -875,56 +877,6 @@ paths:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
post:
description: |
Marks the initdb archive for preservation upon deletion of the timeline or tenant.
This is meant to be part of the disaster recovery process.
responses:
"202":
description: Tenant scheduled to load successfully
"404":
description: No tenant or timeline found for the specified ids
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
@@ -1424,27 +1376,6 @@ components:
$ref: '#/components/schemas/SecondaryConfig'
tenant_conf:
$ref: '#/components/schemas/TenantConfig'
TenantLocationConfigResponse:
type: object
required:
- shards
properties:
shards:
description: Pageservers where this tenant's shards are attached. Not populated for secondary locations.
type: array
items:
$ref: "#/components/schemas/TenantShardLocation"
TenantShardLocation:
type: object
required:
- node_id
- shard_id
properties:
node_id:
description: Pageserver node ID where this shard is attached
type: integer
shard_id: Tenant shard ID of the shard
type: string
SecondaryConfig:
type: object
properties:

View File

@@ -17,8 +17,6 @@ use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
@@ -189,7 +187,6 @@ impl From<TenantSlotUpsertError> for ApiError {
match e {
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
MapState(e) => e.into(),
ShuttingDown(_) => ApiError::ShuttingDown,
}
}
}
@@ -498,10 +495,6 @@ async fn timeline_create_handler(
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
Err(_) if tenant.cancel.is_cancelled() => {
// In case we get some ugly error type during shutdown, cast it into a clean 503.
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
}
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
json_response(StatusCode::CONFLICT, ())
}
@@ -568,43 +561,6 @@ async fn timeline_list_handler(
json_response(StatusCode::OK, response_data)
}
async fn timeline_preserve_initdb_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
// Part of the process for disaster recovery from safekeeper-stored WAL:
// If we don't recover into a new timeline but want to keep the timeline ID,
// then the initdb archive is deleted. This endpoint copies it to a different
// location where timeline recreation cand find it.
async {
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(|e| ApiError::NotFound(e.into()))?;
timeline
.preserve_initdb_archive()
.await
.context("preserving initdb archive")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_preserve_initdb_archive",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
%timeline_id))
.await?;
json_response(StatusCode::OK, ())
}
async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -959,7 +915,6 @@ async fn tenant_status(
attachment_status: state.attachment_status(),
generation: tenant.generation().into(),
},
walredo: tenant.wal_redo_manager_status(),
timelines: tenant.list_timeline_ids(),
})
}
@@ -1265,9 +1220,19 @@ async fn tenant_create_handler(
};
// We created the tenant. Existing API semantics are that the tenant
// is Active when this function returns.
new_tenant
if let res @ Err(_) = new_tenant
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
.await?;
.await
{
// This shouldn't happen because we just created the tenant directory
// in upsert_location, and there aren't any remote timelines
// to load, so, nothing can really fail during load.
// Don't do cleanup because we don't know how we got here.
// The tenant will likely be in `Broken` state and subsequent
// calls will fail.
res.context("created tenant failed to become active")
.map_err(ApiError::InternalServerError)?;
}
json_response(
StatusCode::CREATED,
@@ -1359,7 +1324,7 @@ async fn put_tenant_location_config_handler(
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
let attached = state
state
.tenant_manager
.upsert_location(
tenant_shard_id,
@@ -1368,8 +1333,7 @@ async fn put_tenant_location_config_handler(
tenant::SpawnMode::Normal,
&ctx,
)
.await?
.is_some();
.await?;
if let Some(_flush_ms) = flush {
match state
@@ -1388,18 +1352,7 @@ async fn put_tenant_location_config_handler(
tracing::info!("No flush requested when configuring");
}
// This API returns a vector of pageservers where the tenant is attached: this is
// primarily for use in the sharding service. For compatibilty, we also return this
// when called directly on a pageserver, but the payload is always zero or one shards.
let mut response = TenantLocationConfigResponse { shards: Vec::new() };
if attached {
response.shards.push(TenantShardLocation {
shard_id: tenant_shard_id,
node_id: state.conf.id,
})
}
json_response(StatusCode::OK, response)
json_response(StatusCode::OK, ())
}
async fn list_location_config_handler(
@@ -1990,10 +1943,6 @@ pub fn make_router(
.post("/v1/tenant/:tenant_id/ignore", |r| {
api_handler(r, tenant_ignore_handler)
})
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|r| api_handler(r, timeline_preserve_initdb_handler),
)
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
})

View File

@@ -1,4 +1,3 @@
#![recursion_limit = "300"]
#![deny(clippy::undocumented_unsafe_blocks)]
mod auth;

View File

@@ -150,43 +150,6 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) struct GetVectoredLatency {
map: EnumMap<TaskKind, Option<Histogram>>,
}
impl GetVectoredLatency {
// Only these task types perform vectored gets. Filter all other tasks out to reduce total
// cardinality of the metric.
const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
self.map[task_kind].as_ref()
}
}
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
let inner = register_histogram_vec!(
"pageserver_get_vectored_seconds",
"Time spent in get_vectored",
&["task_kind"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
GetVectoredLatency {
map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
let task_kind = task_kind.into();
Some(inner.with_label_values(&[task_kind]))
} else {
None
}
})),
}
});
pub(crate) struct PageCacheMetricsForTaskKind {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_immutable: IntCounter,
@@ -969,7 +932,6 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
#[cfg(not(test))]
pub(crate) mod virtual_file_descriptor_cache {
use super::*;
@@ -989,20 +951,6 @@ pub(crate) mod virtual_file_descriptor_cache {
// ```
}
#[cfg(not(test))]
pub(crate) mod virtual_file_io_engine {
use super::*;
pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_virtual_file_io_engine_kind",
"The configured io engine for VirtualFile",
&["kind"],
)
.unwrap()
});
}
#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
global: Histogram,
@@ -1043,7 +991,6 @@ pub enum SmgrQueryType {
GetRelSize,
GetPageAtLsn,
GetDbSize,
GetSlruSegment,
}
#[derive(Debug)]
@@ -1160,12 +1107,11 @@ mod smgr_query_time_tests {
#[test]
fn op_label_name() {
use super::SmgrQueryType::*;
let expect: [(super::SmgrQueryType, &'static str); 5] = [
let expect: [(super::SmgrQueryType, &'static str); 4] = [
(GetRelExists, "get_rel_exists"),
(GetRelSize, "get_rel_size"),
(GetPageAtLsn, "get_page_at_lsn"),
(GetDbSize, "get_db_size"),
(GetSlruSegment, "get_slru_segment"),
];
for (op, expect) in expect {
let actual: &'static str = op.into();
@@ -1651,18 +1597,11 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
#[rustfmt::skip]
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_process_launch_duration",
"Histogram of the duration of successful WalRedoProcess::launch calls",
vec![
0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
],
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
});

View File

@@ -22,8 +22,7 @@ use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
PagestreamNblocksResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::{ShardCount, ShardNumber};
@@ -62,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::Version;
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -75,8 +74,7 @@ use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -323,8 +321,8 @@ enum PageStreamError {
Shutdown,
/// Something went wrong reading a page: this likely indicates a pageserver bug
#[error("Read error")]
Read(#[source] PageReconstructError),
#[error("Read error: {0}")]
Read(PageReconstructError),
/// Ran out of time waiting for an LSN
#[error("LSN timeout: {0}")]
@@ -333,11 +331,11 @@ enum PageStreamError {
/// The entity required to serve the request (tenant or timeline) is not found,
/// or is not found in a suitable state to serve a request.
#[error("Not found: {0}")]
NotFound(Cow<'static, str>),
NotFound(std::borrow::Cow<'static, str>),
/// Request asked for something that doesn't make sense, like an invalid LSN
#[error("Bad request: {0}")]
BadRequest(Cow<'static, str>),
BadRequest(std::borrow::Cow<'static, str>),
}
impl From<PageReconstructError> for PageStreamError {
@@ -369,16 +367,6 @@ impl From<WaitLsnError> for PageStreamError {
}
}
impl From<WaitLsnError> for QueryError {
fn from(value: WaitLsnError) -> Self {
match value {
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect,
}
}
}
impl PageServerHandler {
pub fn new(
conf: &'static PageServerConf,
@@ -398,18 +386,12 @@ impl PageServerHandler {
/// Future that completes when we need to shut down the connection.
///
/// We currently need to shut down when any of the following happens:
/// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
/// 2. task_mgr requests shutdown of the connection
/// Reasons for need to shut down are:
/// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
/// - task_mgr requests shutdown of the connection
///
/// NB on (1): the connection's lifecycle is not actually tied to any of the
/// `shard_timelines`s' lifecycles. But it's _necessary_ in the current
/// implementation to be responsive to timeline cancellation because
/// the connection holds their `GateGuards` open (sored in `shard_timelines`).
/// We currently do the easy thing and terminate the connection if any of the
/// shard_timelines gets cancelled. But really, we cuold spend more effort
/// and simply remove the cancelled timeline from the `shard_timelines`, thereby
/// dropping the guard.
/// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests`
/// where, at first, `shard_timelines` is empty, see <https://github.com/neondatabase/neon/pull/6388>
///
/// NB: keep in sync with [`Self::is_connection_cancelled`]
async fn await_connection_cancelled(&self) {
@@ -422,17 +404,16 @@ impl PageServerHandler {
// immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk
// missing any inserts to the map.
let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len());
use futures::future::Either;
cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher()));
cancellation_sources.extend(
self.shard_timelines
.values()
.map(|ht| Either::Right(ht.timeline.cancel.cancelled())),
);
FuturesUnordered::from_iter(cancellation_sources)
.next()
.await;
let mut futs = self
.shard_timelines
.values()
.map(|ht| ht.timeline.cancel.cancelled())
.collect::<FuturesUnordered<_>>();
tokio::select! {
_ = task_mgr::shutdown_watcher() => { }
_ = futs.next() => {}
}
}
/// Checking variant of [`Self::await_connection_cancelled`].
@@ -648,15 +629,6 @@ impl PageServerHandler {
span,
)
}
PagestreamFeMessage::GetSlruSegment(req) => {
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
(
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
}
};
match response {
@@ -687,10 +659,7 @@ impl PageServerHandler {
// print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
let full = utils::error::report_compact_sources(&e);
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
@@ -1147,33 +1116,6 @@ impl PageServerHandler {
}))
}
async fn handle_get_slru_segment_request(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
req: &PagestreamGetSlruSegmentRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetSlruSegment);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let kind = SlruKind::from_repr(req.kind)
.ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;
Ok(PagestreamBeMessage::GetSlruSegment(
PagestreamGetSlruSegmentResponse { segment },
))
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
async fn handle_basebackup_request<IO>(
@@ -1186,7 +1128,7 @@ impl PageServerHandler {
full_backup: bool,
gzip: bool,
ctx: RequestContext,
) -> Result<(), QueryError>
) -> anyhow::Result<()>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
@@ -1451,7 +1393,7 @@ where
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Result::<(), QueryError>::Ok(())
anyhow::Ok(())
},
)
.await?;
@@ -1725,7 +1667,6 @@ impl From<GetActiveTenantError> for QueryError {
| GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
QueryError::Shutdown
}
e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
e => QueryError::Other(anyhow::anyhow!(e)),
}
}

View File

@@ -12,13 +12,8 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use pageserver_api::key::{
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -27,7 +22,6 @@ use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
@@ -321,27 +315,6 @@ impl Timeline {
}
}
/// Get the whole SLRU segment
pub(crate) async fn get_slru_segment(
&self,
kind: SlruKind,
segno: u32,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
let n_blocks = self
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
.await?;
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
for blkno in 0..n_blocks {
let block = self
.get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
.await?;
segment.extend_from_slice(&block[..BLCKSZ as usize]);
}
Ok(segment.freeze())
}
/// Look up given SLRU page version.
pub(crate) async fn get_slru_page_at_lsn(
&self,
@@ -555,33 +528,6 @@ impl Timeline {
Ok(Default::default())
}
pub(crate) async fn get_slru_keyspace(
&self,
version: Version<'_>,
ctx: &RequestContext,
) -> Result<KeySpace, PageReconstructError> {
let mut accum = KeySpaceAccum::new();
for kind in SlruKind::iter() {
let mut segments: Vec<u32> = self
.list_slru_segments(kind, version, ctx)
.await?
.into_iter()
.collect();
segments.sort_unstable();
for seg in segments {
let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;
accum.add_range(
slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
);
}
}
Ok(accum.to_keyspace())
}
/// Get a list of SLRU segments
pub(crate) async fn list_slru_segments(
&self,
@@ -1589,6 +1535,366 @@ struct SlruSegmentDirectory {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
// Layout of the Key address space
//
// The Key struct, used to address the underlying key-value store, consists of
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
// all the data and metadata keys into those 18 bytes.
//
// Principles for the mapping:
//
// - Things that are often accessed or modified together, should be close to
// each other in the key space. For example, if a relation is extended by one
// block, we create a new key-value pair for the block data, and update the
// relation size entry. Because of that, the RelSize key comes after all the
// RelBlocks of a relation: the RelSize and the last RelBlock are always next
// to each other.
//
// The key space is divided into four major sections, identified by the first
// byte, and the form a hierarchy:
//
// 00 Relation data and metadata
//
// DbDir () -> (dbnode, spcnode)
// Filenodemap
// RelDir -> relnode forknum
// RelBlocks
// RelSize
//
// 01 SLRUs
//
// SlruDir kind
// SlruSegBlocks segno
// SlruSegSize
//
// 02 pg_twophase
//
// 03 misc
// Controlfile
// checkpoint
// pg_version
//
// 04 aux files
//
// Below is a full list of the keyspace allocation:
//
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
//
// Filenodemap:
// 00 SPCNODE DBNODE 00000000 00 00000000
//
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
//
// RelBlock:
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
//
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
//
// SlruDir:
// 01 kind 00000000 00000000 00 00000000
//
// SlruSegBlock:
// 01 kind 00000001 SEGNO 00 BLKNUM
//
// SlruSegSize:
// 01 kind 00000001 SEGNO 00 FFFFFFFF
//
// TwoPhaseDir:
// 02 00000000 00000000 00000000 00 00000000
//
// TwoPhaseFile:
// 02 00000000 00000000 00000000 00 XID
//
// ControlFile:
// 03 00000000 00000000 00000000 00 00000000
//
// Checkpoint:
// 03 00000000 00000000 00000000 00 00000001
//
// AuxFiles:
// 03 00000000 00000000 00000000 00 00000002
//
//-- Section 01: relation data and metadata
const DBDIR_KEY: Key = Key {
field1: 0x00,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0xffffffff,
field5: 0xff,
field6: 0xffffffff,
}
}
fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}
}
fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 1,
}
}
pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: blknum,
}
}
fn rel_size_to_key(rel: RelTag) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffffffff,
}
}
fn rel_key_range(rel: RelTag) -> Range<Key> {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0,
}..Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum + 1,
field6: 0,
}
}
//-- Section 02: SLRUs
fn slru_dir_to_key(kind: SlruKind) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: blknum,
}
}
fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: 0xffffffff,
}
}
fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
let field2 = match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
};
Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 0,
field6: 0,
}..Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 1,
field6: 0,
}
}
//-- Section 03: pg_twophase
const TWOPHASEDIR_KEY: Key = Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
fn twophase_file_key(xid: TransactionId) -> Key {
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}
}
fn twophase_key_range(xid: TransactionId) -> Range<Key> {
let (next_xid, overflowed) = xid.overflowing_add(1);
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}..Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: u8::from(overflowed),
field6: next_xid,
}
}
//-- Section 03: Control file
const CONTROLFILE_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
const CHECKPOINT_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 1,
};
const AUX_FILES_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 2,
};
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
fn is_slru_block_key(key: Key) -> bool {
key.field1 == 0x01 // SLRU-related
&& key.field3 == 0x00000001 // but not SlruDir
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {

View File

@@ -20,7 +20,6 @@ use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::TimelineState;
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
@@ -92,6 +91,7 @@ use std::fs;
use std::fs::File;
use std::io;
use std::ops::Bound::Included;
use std::process::Stdio;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
@@ -365,14 +365,6 @@ impl WalRedoManager {
}
}
}
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
match self {
WalRedoManager::Prod(m) => m.status(),
#[cfg(test)]
WalRedoManager::Test(_) => None,
}
}
}
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
@@ -636,15 +628,9 @@ impl Tenant {
deletion_queue_client,
));
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
// we shut down while attaching.
let Ok(attach_gate_guard) = tenant.gate.enter() else {
// We just created the Tenant: nothing else can have shut it down yet
unreachable!();
};
// Do all the hard work in the background
let tenant_clone = Arc::clone(&tenant);
let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
task_mgr::spawn(
&tokio::runtime::Handle::current(),
@@ -654,8 +640,6 @@ impl Tenant {
"attach tenant",
false,
async move {
let _gate_guard = attach_gate_guard;
// Is this tenant being spawned as part of process startup?
let starting_up = init_order.is_some();
scopeguard::defer! {
@@ -732,10 +716,6 @@ impl Tenant {
// stayed in Activating for such a long time that shutdown found it in
// that state.
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
// just shutting down), but ensures progress.
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
return Ok(());
},
)
@@ -830,7 +810,7 @@ impl Tenant {
SpawnMode::Create => None,
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
};
match tenant_clone.attach(preload, mode, &ctx).await {
match tenant_clone.attach(preload, &ctx).await {
Ok(()) => {
info!("attach finished, activating");
if let Some(t)= attach_timer {t.observe_duration();}
@@ -917,20 +897,15 @@ impl Tenant {
async fn attach(
self: &Arc<Tenant>,
preload: Option<TenantPreload>,
mode: SpawnMode,
ctx: &RequestContext,
) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
failpoint_support::sleep_millis_async!("before-attaching-tenant");
let preload = match (preload, mode) {
(Some(p), _) => p,
(None, SpawnMode::Create) => TenantPreload {
deleting: false,
timelines: HashMap::new(),
},
(None, SpawnMode::Normal) => {
let preload = match preload {
Some(p) => p,
None => {
// Deprecated dev mode: load from local disk state instead of remote storage
// https://github.com/neondatabase/neon/issues/5624
return self.load_local(ctx).await;
@@ -1029,7 +1004,6 @@ impl Tenant {
Some(remote_timeline_client),
self.deletion_queue_client.clone(),
)
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
.await
.context("resume_deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
@@ -1039,10 +1013,7 @@ impl Tenant {
// IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?;
fail::fail_point!("attach-before-activate", |_| {
anyhow::bail!("attach-before-activate");
});
failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
info!("Done");
@@ -1706,13 +1677,9 @@ impl Tenant {
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
if !self.is_active() {
if matches!(self.current_state(), TenantState::Stopping { .. }) {
return Err(CreateTimelineError::ShuttingDown);
} else {
return Err(CreateTimelineError::Other(anyhow::anyhow!(
"Cannot create timelines on inactive tenant"
)));
}
return Err(CreateTimelineError::Other(anyhow::anyhow!(
"Cannot create timelines on inactive tenant"
)));
}
let _gate = self
@@ -1966,10 +1933,6 @@ impl Tenant {
self.generation
}
pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
self.walredo_mgr.status()
}
/// Changes tenant status to active, unless shutdown was already requested.
///
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
@@ -2107,10 +2070,7 @@ impl Tenant {
let timelines = self.timelines.lock().unwrap();
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let timeline_id = timeline.timeline_id;
let span =
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
let span = Span::current();
js.spawn(async move {
if freeze_and_flush {
timeline.flush_and_shutdown().instrument(span).await
@@ -2710,7 +2670,7 @@ impl Tenant {
activate_now_sem: tokio::sync::Semaphore::new(0),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
cancel: CancellationToken::default(),
gate: Gate::default(),
gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
}
}
@@ -3795,30 +3755,27 @@ async fn run_initdb(
.env_clear()
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
.stdin(std::process::Stdio::null())
// stdout invocation produces the same output every time, we don't need it
.stdout(std::process::Stdio::null())
// we would be interested in the stderr output, if there was any
.stderr(std::process::Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
// If the `select!` below doesn't finish the `wait_with_output`,
// let the task get `wait()`ed for asynchronously by tokio.
// This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
// TODO: fix for this is non-trivial, see
// https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
//
.kill_on_drop(true)
.spawn()?;
// Ideally we'd select here with the cancellation token, but the problem is that
// we can't safely terminate initdb: it launches processes of its own, and killing
// initdb doesn't kill them. After we return from this function, we want the target
// directory to be able to be cleaned up.
// See https://github.com/neondatabase/neon/issues/6385
let initdb_output = initdb_command.wait_with_output().await?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(
initdb_output.status,
initdb_output.stderr,
));
}
// This isn't true cancellation support, see above. Still return an error to
// excercise the cancellation code path.
if cancel.is_cancelled() {
return Err(InitdbError::Cancelled);
tokio::select! {
initdb_output = initdb_command.wait_with_output() => {
let initdb_output = initdb_output?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
}
}
_ = cancel.cancelled() => {
return Err(InitdbError::Cancelled);
}
}
Ok(())
@@ -3920,7 +3877,6 @@ pub(crate) mod harness {
),
gc_feedback: Some(tenant_conf.gc_feedback),
heatmap_period: Some(tenant_conf.heatmap_period),
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
}
}
}
@@ -4075,7 +4031,7 @@ pub(crate) mod harness {
.instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
.await?;
tenant
.attach(Some(preload), SpawnMode::Normal, ctx)
.attach(Some(preload), ctx)
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
.await?;
}
@@ -5243,7 +5199,7 @@ mod tests {
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown()
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID))
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id))
.await;
std::mem::forget(tline);
}

View File

@@ -5,10 +5,10 @@
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::ops::Deref;
use std::ops::{Deref, DerefMut};
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
/// blocks, using the page cache
@@ -39,8 +39,6 @@ pub enum BlockLease<'a> {
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
#[cfg(test)]
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
#[cfg(test)]
Vec(Vec<u8>),
}
impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -65,10 +63,6 @@ impl<'a> Deref for BlockLease<'a> {
BlockLease::EphemeralFileMutableTail(v) => v,
#[cfg(test)]
BlockLease::Arc(v) => v.deref(),
#[cfg(test)]
BlockLease::Vec(v) => {
TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
}
}
}
}
@@ -175,14 +169,10 @@ impl FileBlockReader {
}
/// Read a page from the underlying file into given buffer.
async fn fill_buffer(
&self,
buf: PageWriteGuard<'static>,
blkno: u32,
) -> Result<PageWriteGuard<'static>, std::io::Error> {
async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file
.read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
.await
}
/// Read a block.
@@ -206,9 +196,9 @@ impl FileBlockReader {
)
})? {
ReadBufResult::Found(guard) => Ok(guard.into()),
ReadBufResult::NotFound(write_guard) => {
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
let write_guard = self.fill_buffer(write_guard, blknum).await?;
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
Ok(write_guard.mark_valid().into())
}
}

View File

@@ -345,9 +345,6 @@ pub struct TenantConf {
/// may be disabled if a Tenant will not have secondary locations: only secondary
/// locations will use the heatmap uploaded by attached locations.
pub heatmap_period: Duration,
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
pub lazy_slru_download: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -433,10 +430,6 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
#[serde(default)]
pub heatmap_period: Option<Duration>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub lazy_slru_download: Option<bool>,
}
impl TenantConfOpt {
@@ -482,9 +475,6 @@ impl TenantConfOpt {
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
lazy_slru_download: self
.lazy_slru_download
.unwrap_or(global_conf.lazy_slru_download),
}
}
}
@@ -523,7 +513,6 @@ impl Default for TenantConf {
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
heatmap_period: Duration::ZERO,
lazy_slru_download: false,
}
}
}
@@ -595,7 +584,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
.map(humantime),
gc_feedback: value.gc_feedback,
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
}
}
}

View File

@@ -136,11 +136,7 @@ async fn schedule_ordered_timeline_deletions(
let mut already_running_deletions = vec![];
for (timeline_id, _) in sorted.into_iter().rev() {
let span = tracing::info_span!("timeline_delete", %timeline_id);
let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
.instrument(span)
.await;
if let Err(e) = res {
if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
match e {
DeleteTimelineError::NotFound => {
// Timeline deletion finished after call to clone above but before call
@@ -413,10 +409,7 @@ impl DeleteTenantFlow {
.await
.expect("cant be stopping or broken");
tenant
.attach(preload, super::SpawnMode::Normal, ctx)
.await
.context("attach")?;
tenant.attach(preload, ctx).await.context("attach")?;
Self::background(
guard,

View File

@@ -5,11 +5,11 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use crate::virtual_file::VirtualFile;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use std::cmp::min;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::sync::atomic::AtomicU64;
@@ -47,10 +47,7 @@ impl EphemeralFile {
let file = VirtualFile::open_with_options(
&filename,
virtual_file::OpenOptions::new()
.read(true)
.write(true)
.create(true),
OpenOptions::new().read(true).write(true).create(true),
)
.await?;
@@ -92,10 +89,11 @@ impl EphemeralFile {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = self
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));

View File

@@ -51,10 +51,7 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use anyhow::Result;
use pageserver_api::keyspace::KeySpaceAccum;
use std::cmp::Ordering;
use std::collections::{BTreeMap, VecDeque};
use std::iter::Peekable;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use utils::lsn::Lsn;
@@ -147,221 +144,11 @@ impl Drop for BatchedUpdates<'_> {
}
/// Return value of LayerMap::search
#[derive(Eq, PartialEq, Debug)]
pub struct SearchResult {
pub layer: Arc<PersistentLayerDesc>,
pub lsn_floor: Lsn,
}
pub struct OrderedSearchResult(SearchResult);
impl Ord for OrderedSearchResult {
fn cmp(&self, other: &Self) -> Ordering {
self.0.lsn_floor.cmp(&other.0.lsn_floor)
}
}
impl PartialOrd for OrderedSearchResult {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for OrderedSearchResult {
fn eq(&self, other: &Self) -> bool {
self.0.lsn_floor == other.0.lsn_floor
}
}
impl Eq for OrderedSearchResult {}
pub struct RangeSearchResult {
pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
pub not_found: KeySpaceAccum,
}
impl RangeSearchResult {
fn new() -> Self {
Self {
found: BTreeMap::new(),
not_found: KeySpaceAccum::new(),
}
}
}
/// Collector for results of range search queries on the LayerMap.
/// It should be provided with two iterators for the delta and image coverage
/// that contain all the changes for layers which intersect the range.
struct RangeSearchCollector<Iter>
where
Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
{
delta_coverage: Peekable<Iter>,
image_coverage: Peekable<Iter>,
key_range: Range<Key>,
end_lsn: Lsn,
current_delta: Option<Arc<PersistentLayerDesc>>,
current_image: Option<Arc<PersistentLayerDesc>>,
result: RangeSearchResult,
}
#[derive(Debug)]
enum NextLayerType {
Delta(i128),
Image(i128),
Both(i128),
}
impl NextLayerType {
fn next_change_at_key(&self) -> Key {
match self {
NextLayerType::Delta(at) => Key::from_i128(*at),
NextLayerType::Image(at) => Key::from_i128(*at),
NextLayerType::Both(at) => Key::from_i128(*at),
}
}
}
impl<Iter> RangeSearchCollector<Iter>
where
Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
{
fn new(
key_range: Range<Key>,
end_lsn: Lsn,
delta_coverage: Iter,
image_coverage: Iter,
) -> Self {
Self {
delta_coverage: delta_coverage.peekable(),
image_coverage: image_coverage.peekable(),
key_range,
end_lsn,
current_delta: None,
current_image: None,
result: RangeSearchResult::new(),
}
}
/// Run the collector. Collection is implemented via a two pointer algorithm.
/// One pointer tracks the start of the current range and the other tracks
/// the beginning of the next range which will overlap with the next change
/// in coverage across both image and delta.
fn collect(mut self) -> RangeSearchResult {
let next_layer_type = self.choose_next_layer_type();
let mut current_range_start = match next_layer_type {
None => {
// No changes for the range
self.pad_range(self.key_range.clone());
return self.result;
}
Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
// Changes only after the end of the range
self.pad_range(self.key_range.clone());
return self.result;
}
Some(layer_type) => {
// Changes for the range exist. Record anything before the first
// coverage change as not found.
let coverage_start = layer_type.next_change_at_key();
let range_before = self.key_range.start..coverage_start;
self.pad_range(range_before);
self.advance(&layer_type);
coverage_start
}
};
while current_range_start < self.key_range.end {
let next_layer_type = self.choose_next_layer_type();
match next_layer_type {
Some(t) => {
let current_range_end = t.next_change_at_key();
self.add_range(current_range_start..current_range_end);
current_range_start = current_range_end;
self.advance(&t);
}
None => {
self.add_range(current_range_start..self.key_range.end);
current_range_start = self.key_range.end;
}
}
}
self.result
}
/// Mark a range as not found (i.e. no layers intersect it)
fn pad_range(&mut self, key_range: Range<Key>) {
if !key_range.is_empty() {
self.result.not_found.add_range(key_range);
}
}
/// Select the appropiate layer for the given range and update
/// the collector.
fn add_range(&mut self, covered_range: Range<Key>) {
let selected = LayerMap::select_layer(
self.current_delta.clone(),
self.current_image.clone(),
self.end_lsn,
);
match selected {
Some(search_result) => self
.result
.found
.entry(OrderedSearchResult(search_result))
.or_default()
.add_range(covered_range),
None => self.pad_range(covered_range),
}
}
/// Move to the next coverage change.
fn advance(&mut self, layer_type: &NextLayerType) {
match layer_type {
NextLayerType::Delta(_) => {
let (_, layer) = self.delta_coverage.next().unwrap();
self.current_delta = layer;
}
NextLayerType::Image(_) => {
let (_, layer) = self.image_coverage.next().unwrap();
self.current_image = layer;
}
NextLayerType::Both(_) => {
let (_, image_layer) = self.image_coverage.next().unwrap();
let (_, delta_layer) = self.delta_coverage.next().unwrap();
self.current_image = image_layer;
self.current_delta = delta_layer;
}
}
}
/// Pick the next coverage change: the one at the lesser key or both if they're alligned.
fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
match (next_delta_at, next_image_at) {
(None, None) => None,
(Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
(None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
(Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
Some(NextLayerType::Image(*next_image_at))
}
(Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
Some(NextLayerType::Delta(*next_delta_at))
}
(Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
}
}
}
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
@@ -399,18 +186,7 @@ impl LayerMap {
let latest_delta = version.delta_coverage.query(key.to_i128());
let latest_image = version.image_coverage.query(key.to_i128());
Self::select_layer(latest_delta, latest_image, end_lsn)
}
fn select_layer(
delta_layer: Option<Arc<PersistentLayerDesc>>,
image_layer: Option<Arc<PersistentLayerDesc>>,
end_lsn: Lsn,
) -> Option<SearchResult> {
assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
match (delta_layer, image_layer) {
match (latest_delta, latest_image) {
(None, None) => None,
(None, Some(image)) => {
let lsn_floor = image.get_lsn_range().start;
@@ -447,17 +223,6 @@ impl LayerMap {
}
}
pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
let image_changes = version.image_coverage.range_overlaps(&raw_range);
let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
Some(collector.collect())
}
/// Start a batch of updates, applied on drop
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
BatchedUpdates { layer_map: self }
@@ -518,15 +283,15 @@ impl LayerMap {
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted.
pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> Result<bool> {
if key.is_empty() {
// Vacuously true. There's a newer image for all 0 of the kerys in the range.
return true;
return Ok(true);
}
let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
Some(v) => v,
None => return false,
None => return Ok(false),
};
let start = key.start.to_i128();
@@ -539,17 +304,17 @@ impl LayerMap {
// Check the start is covered
if !layer_covers(version.image_coverage.query(start)) {
return false;
return Ok(false);
}
// Check after all changes of coverage
for (_, change_val) in version.image_coverage.range(start..end) {
if !layer_covers(change_val) {
return false;
return Ok(false);
}
}
true
Ok(true)
}
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
@@ -560,14 +325,18 @@ impl LayerMap {
/// Divide the whole given range of keys into sub-ranges based on the latest
/// image layer that covers each range at the specified lsn (inclusive).
/// This is used when creating new image layers.
///
// FIXME: clippy complains that the result type is very complex. She's probably
// right...
#[allow(clippy::type_complexity)]
pub fn image_coverage(
&self,
key_range: &Range<Key>,
lsn: Lsn,
) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
let version = match self.historic.get().unwrap().get_version(lsn.0) {
Some(v) => v,
None => return vec![],
None => return Ok(vec![]),
};
let start = key_range.start.to_i128();
@@ -590,7 +359,7 @@ impl LayerMap {
let kr = Key::from_i128(current_key)..Key::from_i128(end);
coverage.push((kr, current_val.take()));
coverage
Ok(coverage)
}
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
@@ -641,19 +410,24 @@ impl LayerMap {
/// This number is used to compute the largest number of deltas that
/// we'll need to visit for any page reconstruction in this region.
/// We use this heuristic to decide whether to create an image layer.
pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
pub fn count_deltas(
&self,
key: &Range<Key>,
lsn: &Range<Lsn>,
limit: Option<usize>,
) -> Result<usize> {
// We get the delta coverage of the region, and for each part of the coverage
// we recurse right underneath the delta. The recursion depth is limited by
// the largest result this function could return, which is in practice between
// 3 and 10 (since we usually try to create an image when the number gets larger).
if lsn.is_empty() || key.is_empty() || limit == Some(0) {
return 0;
return Ok(0);
}
let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
Some(v) => v,
None => return 0,
None => return Ok(0),
};
let start = key.start.to_i128();
@@ -674,7 +448,8 @@ impl LayerMap {
if !kr.is_empty() {
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
let max_stacked_deltas_underneath =
self.count_deltas(&kr, &lr, new_limit)?;
max_stacked_deltas = std::cmp::max(
max_stacked_deltas,
base_count + max_stacked_deltas_underneath,
@@ -696,7 +471,7 @@ impl LayerMap {
if !kr.is_empty() {
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
max_stacked_deltas = std::cmp::max(
max_stacked_deltas,
base_count + max_stacked_deltas_underneath,
@@ -705,7 +480,7 @@ impl LayerMap {
}
}
max_stacked_deltas
Ok(max_stacked_deltas)
}
/// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
@@ -817,7 +592,10 @@ impl LayerMap {
if limit == Some(difficulty) {
break;
}
for (img_range, last_img) in self.image_coverage(range, lsn) {
for (img_range, last_img) in self
.image_coverage(range, lsn)
.expect("why would this err?")
{
if limit == Some(difficulty) {
break;
}
@@ -828,7 +606,9 @@ impl LayerMap {
};
if img_lsn < lsn {
let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
let num_deltas = self
.count_deltas(&img_range, &(img_lsn..lsn), limit)
.expect("why would this err lol?");
difficulty = std::cmp::max(difficulty, num_deltas);
}
}
@@ -866,126 +646,3 @@ impl LayerMap {
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[derive(Clone)]
struct LayerDesc {
key_range: Range<Key>,
lsn_range: Range<Lsn>,
is_delta: bool,
}
fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
let mut layer_map = LayerMap::default();
for layer in layers {
layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
layer.key_range,
layer.lsn_range,
layer.is_delta,
));
}
layer_map.flush_updates();
layer_map
}
fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
let lhs: Vec<_> = lhs
.found
.into_iter()
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
.collect();
let rhs: Vec<_> = rhs
.found
.into_iter()
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
.collect();
assert_eq!(lhs, rhs);
}
fn brute_force_range_search(
layer_map: &LayerMap,
key_range: Range<Key>,
end_lsn: Lsn,
) -> RangeSearchResult {
let mut range_search_result = RangeSearchResult::new();
let mut key = key_range.start;
while key != key_range.end {
let res = layer_map.search(key, end_lsn);
match res {
Some(res) => {
range_search_result
.found
.entry(OrderedSearchResult(res))
.or_default()
.add_key(key);
}
None => {
range_search_result.not_found.add_key(key);
}
}
key = key.next();
}
range_search_result
}
#[test]
fn ranged_search_on_empty_layer_map() {
let layer_map = LayerMap::default();
let range = Key::from_i128(100)..Key::from_i128(200);
let res = layer_map.range_search(range, Lsn(100));
assert!(res.is_none());
}
#[test]
fn ranged_search() {
let layers = vec![
LayerDesc {
key_range: Key::from_i128(15)..Key::from_i128(50),
lsn_range: Lsn(0)..Lsn(5),
is_delta: false,
},
LayerDesc {
key_range: Key::from_i128(10)..Key::from_i128(20),
lsn_range: Lsn(5)..Lsn(20),
is_delta: true,
},
LayerDesc {
key_range: Key::from_i128(15)..Key::from_i128(25),
lsn_range: Lsn(20)..Lsn(30),
is_delta: true,
},
LayerDesc {
key_range: Key::from_i128(35)..Key::from_i128(40),
lsn_range: Lsn(25)..Lsn(35),
is_delta: true,
},
LayerDesc {
key_range: Key::from_i128(35)..Key::from_i128(40),
lsn_range: Lsn(35)..Lsn(40),
is_delta: false,
},
];
let layer_map = create_layer_map(layers.clone());
for start in 0..60 {
for end in (start + 1)..60 {
let range = Key::from_i128(start)..Key::from_i128(end);
let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
let expected = brute_force_range_search(&layer_map, range, Lsn(100));
assert_range_search_result_eq(result, expected);
}
}
}
}

View File

@@ -129,42 +129,6 @@ impl<Value: Clone> LayerCoverage<Value> {
.map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
}
/// Returns an iterator which includes all coverage changes for layers that intersect
/// with the provided range.
pub fn range_overlaps(
&self,
key_range: &Range<i128>,
) -> impl Iterator<Item = (i128, Option<Value>)> + '_
where
Value: Eq,
{
let first_change = self.query(key_range.start);
match first_change {
Some(change) => {
// If the start of the range is covered, we have to deal with two cases:
// 1. Start of the range is aligned with the start of a layer.
// In this case the return of `self.range` will contain the layer which aligns with the start of the key range.
// We advance said iterator to avoid duplicating the first change.
// 2. Start of the range is not aligned with the start of a layer.
let range = key_range.start..key_range.end;
let mut range_coverage = self.range(range).peekable();
if range_coverage
.peek()
.is_some_and(|c| c.1.as_ref() == Some(&change))
{
range_coverage.next();
}
itertools::Either::Left(
std::iter::once((key_range.start, Some(change))).chain(range_coverage),
)
}
None => {
let range = key_range.start..key_range.end;
let coverage = self.range(range);
itertools::Either::Right(coverage)
}
}
}
/// O(1) clone
pub fn clone(&self) -> Self {
Self {

View File

@@ -7,7 +7,6 @@ use pageserver_api::models::ShardParameters;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap};
use std::ops::Deref;
use std::sync::Arc;
@@ -33,8 +32,7 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
TenantConfOpt,
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
@@ -468,26 +466,6 @@ pub async fn init_tenant_mgr(
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_shard_id) {
if let LocationMode::Attached(attached) = &location_conf.mode {
if attached.generation > *gen {
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
attached.generation
);
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
// local disk content: demote to secondary rather than detaching.
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf,
&SecondaryLocationConfig { warm: false },
)),
);
}
}
*gen
} else {
match &location_conf.mode {
@@ -743,7 +721,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
tokio::select! {
Some(joined) = join_set.join_next() => {
match joined {
Ok(()) => {},
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the tasks");
}
@@ -904,7 +882,7 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
new_location_config: LocationConf,
flush: Option<Duration>,
mut spawn_mode: SpawnMode,
spawn_mode: SpawnMode,
ctx: &RequestContext,
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
debug_assert_current_span_has_tenant_id();
@@ -924,29 +902,19 @@ impl TenantManager {
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
match (&new_location_config.mode, peek_slot) {
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
match attach_conf.generation.cmp(&tenant.generation) {
Ordering::Equal => {
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(
AttachedTenantConf::try_from(new_location_config.clone())
.map_err(UpsertLocationError::BadRequest)?,
);
if attach_conf.generation == tenant.generation {
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(
AttachedTenantConf::try_from(new_location_config.clone())
.map_err(UpsertLocationError::BadRequest)?,
);
Some(FastPathModified::Attached(tenant.clone()))
}
Ordering::Less => {
return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
"Generation {:?} is less than existing {:?}",
attach_conf.generation,
tenant.generation
)));
}
Ordering::Greater => {
// Generation advanced, fall through to general case of replacing `Tenant` object
None
}
Some(FastPathModified::Attached(tenant.clone()))
} else {
// Different generations, fall through to general case
None
}
}
(
@@ -1051,12 +1019,6 @@ impl TenantManager {
}
}
slot_guard.drop_old_value().expect("We just shut it down");
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
// the caller thinks they're creating but the tenant already existed. We must switch to
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Normal;
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
@@ -1140,46 +1102,14 @@ impl TenantManager {
None
};
match slot_guard.upsert(new_slot) {
Err(TenantSlotUpsertError::InternalError(e)) => {
Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
slot_guard.upsert(new_slot).map_err(|e| match e {
TenantSlotUpsertError::InternalError(e) => {
UpsertLocationError::Other(anyhow::anyhow!(e))
}
Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
// If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
// we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
// are shutdown.
//
// We must shut it down inline here.
match new_slot {
TenantSlot::InProgress(_) => {
// Unreachable because we never insert an InProgress
unreachable!()
}
TenantSlot::Attached(tenant) => {
let (_guard, progress) = utils::completion::channel();
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
match tenant.shutdown(progress, false).await {
Ok(()) => {
info!("Finished shutting down just-spawned tenant");
}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
}
}
TenantSlot::Secondary(secondary_tenant) => {
secondary_tenant.shutdown().await;
}
}
TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
})?;
Err(UpsertLocationError::Unavailable(
TenantMapError::ShuttingDown,
))
}
Ok(()) => Ok(attached_tenant),
}
Ok(attached_tenant)
}
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1311,7 +1241,6 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
activation_timeout: Duration,
) -> Result<(), DeleteTenantError> {
super::span::debug_assert_current_span_has_tenant_id();
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
// have to return the Tenant to the map while the background deletion runs.
@@ -1799,31 +1728,14 @@ pub(crate) enum TenantSlotError {
/// Superset of TenantMapError: issues that can occur when using a SlotGuard
/// to insert a new value.
#[derive(thiserror::Error)]
pub(crate) enum TenantSlotUpsertError {
#[derive(Debug, thiserror::Error)]
pub enum TenantSlotUpsertError {
/// An error where the slot is in an unexpected state, indicating a code bug
#[error("Internal error updating Tenant")]
InternalError(Cow<'static, str>),
#[error(transparent)]
MapState(TenantMapError),
// If we encounter TenantManager shutdown during upsert, we must carry the Completion
// from the SlotGuard, so that the caller can hold it while they clean up: otherwise
// TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
// was protected by the SlotGuard.
#[error("Shutting down")]
ShuttingDown((TenantSlot, utils::completion::Completion)),
}
impl std::fmt::Debug for TenantSlotUpsertError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
}
}
MapState(#[from] TenantMapError),
}
#[derive(Debug, thiserror::Error)]
@@ -1872,7 +1784,7 @@ pub struct SlotGuard {
/// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
/// release any waiters as soon as this SlotGuard is dropped.
completion: utils::completion::Completion,
_completion: utils::completion::Completion,
}
impl SlotGuard {
@@ -1885,7 +1797,7 @@ impl SlotGuard {
tenant_shard_id,
old_value,
upserted: false,
completion,
_completion: completion,
}
}
@@ -1918,16 +1830,9 @@ impl SlotGuard {
}
let m = match &mut *locked {
TenantsMap::Initializing => {
return Err(TenantSlotUpsertError::MapState(
TenantMapError::StillInitializing,
))
}
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
TenantsMap::ShuttingDown(_) => {
return Err(TenantSlotUpsertError::ShuttingDown((
new_value,
self.completion.clone(),
)));
return Err(TenantMapError::ShuttingDown.into());
}
TenantsMap::Open(m) => m,
};
@@ -1975,9 +1880,7 @@ impl SlotGuard {
Err(TenantSlotUpsertError::InternalError(_)) => {
// We already logged the error, nothing else we can do.
}
Err(
TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
) => {
Err(TenantSlotUpsertError::MapState(_)) => {
// If the map is shutting down, we need not replace anything
}
Ok(()) => {}
@@ -2075,22 +1978,18 @@ fn tenant_map_peek_slot<'a>(
tenant_shard_id: &TenantShardId,
mode: TenantSlotPeekMode,
) -> Result<Option<&'a TenantSlot>, TenantMapError> {
match tenants.deref() {
TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
let m = match tenants.deref() {
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
TenantsMap::ShuttingDown(m) => match mode {
TenantSlotPeekMode::Read => Ok(Some(
// When reading in ShuttingDown state, we must translate None results
// into a ShuttingDown error, because absence of a tenant shard ID in the map
// isn't a reliable indicator of the tenant being gone: it might have been
// InProgress when shutdown started, and cleaned up from that state such
// that it's now no longer in the map. Callers will have to wait until
// we next start up to get a proper answer. This avoids incorrect 404 API responses.
m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
)),
TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
TenantSlotPeekMode::Read => m,
TenantSlotPeekMode::Write => {
return Err(TenantMapError::ShuttingDown);
}
},
TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
}
TenantsMap::Open(m) => m,
};
Ok(m.get(tenant_shard_id))
}
enum TenantSlotAcquireMode {

View File

@@ -257,8 +257,6 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
/// Default buffer size when interfacing with [`tokio::fs::File`].
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
@@ -1068,28 +1066,6 @@ impl RemoteTimelineClient {
Ok(())
}
pub(crate) async fn preserve_initdb_archive(
self: &Arc<Self>,
tenant_id: &TenantId,
timeline_id: &TimelineId,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
backoff::retry(
|| async {
upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
.await
},
|_e| false,
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"preserve_initdb_tar_zst",
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("backing up initdb archive")?;
Ok(())
}
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
/// deletes leaked files if any and proceeds with deletion of index file at the end.
@@ -1125,14 +1101,6 @@ impl RemoteTimelineClient {
let layer_deletion_count = layers.len();
self.deletion_queue_client.push_immediate(layers).await?;
// Delete the initdb.tar.zst, which is not always present, but deletion attempts of
// inexistant objects are not considered errors.
let initdb_path =
remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
self.deletion_queue_client
.push_immediate(vec![initdb_path])
.await?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
@@ -1180,8 +1148,10 @@ impl RemoteTimelineClient {
if p == &latest_index {
return false;
}
if p.object_name() == Some(INITDB_PRESERVED_PATH) {
return false;
if let Some(name) = p.object_name() {
if name == INITDB_PATH {
return false;
}
}
true
})
@@ -1754,16 +1724,6 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
.expect("Failed to construct path")
}
pub fn remote_initdb_preserved_archive_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
) -> RemotePath {
RemotePath::from_string(&format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}"
))
.expect("Failed to construct path")
}
pub fn remote_index_path(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,

View File

@@ -32,8 +32,7 @@ use utils::id::TimelineId;
use super::index::{IndexPart, LayerFileMetadata};
use super::{
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
INITDB_PATH,
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
};
///
@@ -431,9 +430,6 @@ pub(crate) async fn download_initdb_tar_zst(
let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
let remote_preserved_path =
remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id);
let timeline_path = conf.timelines_path(tenant_shard_id);
if !timeline_path.exists() {
@@ -460,16 +456,8 @@ pub(crate) async fn download_initdb_tar_zst(
.with_context(|| format!("tempfile creation {temp_path}"))
.map_err(DownloadError::Other)?;
let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
.await
{
Ok(dl) => dl,
Err(DownloadError::NotFound) => {
download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
.await?
}
Err(other) => Err(other)?,
};
let download =
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

View File

@@ -13,8 +13,8 @@ use super::Generation;
use crate::{
config::PageServerConf,
tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
upload_cancellable,
},
};
use remote_storage::GenericRemoteStorage;
@@ -144,16 +144,3 @@ pub(crate) async fn upload_initdb_dir(
.await
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
}
pub(crate) async fn preserve_initdb_archive(
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
.await
.with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
}

View File

@@ -112,7 +112,7 @@ impl SecondaryTenant {
// on shutdown we walk the tenants and fire their
// individual cancellations?
cancel: CancellationToken::new(),
gate: Gate::default(),
gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
shard_identity,
tenant_conf: std::sync::Mutex::new(tenant_conf),

View File

@@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::Timeline;
use crate::virtual_file::{self, VirtualFile};
use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
@@ -649,7 +649,7 @@ impl DeltaLayer {
{
let file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
&*std::fs::OpenOptions::new().read(true).write(true),
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
@@ -884,7 +884,7 @@ impl DeltaLayerInner {
let keys = self.load_keys(ctx).await?;
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
let desc = match val {
@@ -906,32 +906,13 @@ impl DeltaLayerInner {
for entry in keys {
let DeltaEntry { key, lsn, val, .. } = entry;
let desc = match dump_blob(&val, ctx).await {
let desc = match dump_blob(val, ctx).await {
Ok(desc) => desc,
Err(err) => {
format!("ERROR: {err}")
}
};
println!(" key {key} at {lsn}: {desc}");
// Print more details about CHECKPOINT records. Would be nice to print details
// of many other record types too, but these are particularly interesting, as
// have a lot of special processing for them in walingest.rs.
use pageserver_api::key::CHECKPOINT_KEY;
use postgres_ffi::CheckPoint;
if key == CHECKPOINT_KEY {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
match val {
Value::Image(img) => {
let checkpoint = CheckPoint::decode(&img)?;
println!(" CHECKPOINT: {:?}", checkpoint);
}
Value::WalRecord(_rec) => {
println!(" unexpected walrecord value for checkpoint key");
}
}
}
}
Ok(())

View File

@@ -34,7 +34,7 @@ use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::Timeline;
use crate::virtual_file::{self, VirtualFile};
use crate::virtual_file::VirtualFile;
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
@@ -327,7 +327,7 @@ impl ImageLayer {
{
let file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
&*std::fs::OpenOptions::new().read(true).write(true),
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
@@ -492,15 +492,11 @@ impl ImageLayerWriterInner {
},
);
info!("new image layer {path}");
let mut file = {
VirtualFile::open_with_options(
&path,
virtual_file::OpenOptions::new()
.write(true)
.create_new(true),
)
.await?
};
let mut file = VirtualFile::open_with_options(
&path,
std::fs::OpenOptions::new().write(true).create_new(true),
)
.await?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);

View File

@@ -55,13 +55,13 @@ impl PersistentLayerDesc {
}
#[cfg(test)]
pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
pub fn new_test(key_range: Range<Key>) -> Self {
Self {
tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
timeline_id: TimelineId::generate(),
key_range,
lsn_range,
is_delta,
lsn_range: Lsn(0)..Lsn(1),
is_delta: false,
file_size: 0,
}
}

View File

@@ -9,7 +9,6 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -182,11 +181,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
log_compaction_error(
&e,
error_run_count,
&wait_duration,
cancel.is_cancelled(),
error!(
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
wait_duration
} else {
@@ -214,58 +210,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
fn log_compaction_error(
e: &CompactionError,
error_run_count: u32,
sleep_duration: &std::time::Duration,
task_cancelled: bool,
) {
use crate::tenant::upload_queue::NotInitialized;
use crate::tenant::PageReconstructError;
use CompactionError::*;
enum LooksLike {
Info,
Error,
}
let decision = match e {
ShuttingDown => None,
_ if task_cancelled => Some(LooksLike::Info),
Other(e) => {
let root_cause = e.root_cause();
let is_stopping = {
let upload_queue = root_cause
.downcast_ref::<NotInitialized>()
.is_some_and(|e| e.is_stopping());
let timeline = root_cause
.downcast_ref::<PageReconstructError>()
.is_some_and(|e| e.is_stopping());
upload_queue || timeline
};
if is_stopping {
Some(LooksLike::Info)
} else {
Some(LooksLike::Error)
}
}
};
match decision {
Some(LooksLike::Info) => info!(
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
),
Some(LooksLike::Error) => error!(
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
),
None => {}
}
}
///
/// GC task's main loop
///

View File

@@ -14,7 +14,6 @@ use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::{
keyspace::{key_range_size, KeySpaceAccum},
models::{
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
LayerMapInfo, TimelineState,
@@ -33,7 +32,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::sync::gate::Gate;
use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::ops::{Deref, Range};
use std::pin::pin;
use std::sync::atomic::Ordering as AtomicOrdering;
@@ -74,8 +73,8 @@ use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::tenant::config::TenantConfOpt;
use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
@@ -392,7 +391,8 @@ pub(crate) enum PageReconstructError {
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("timeline shutting down")]
/// The operation was cancelled
#[error("Cancelled")]
Cancelled,
/// The ancestor of this is being stopped
@@ -404,34 +404,6 @@ pub(crate) enum PageReconstructError {
WalRedo(anyhow::Error),
}
impl PageReconstructError {
/// Returns true if this error indicates a tenant/timeline shutdown alike situation
pub(crate) fn is_stopping(&self) -> bool {
use PageReconstructError::*;
match self {
Other(_) => false,
AncestorLsnTimeout(_) => false,
Cancelled | AncestorStopping(_) => true,
WalRedo(_) => false,
}
}
}
#[derive(thiserror::Error, Debug)]
enum CreateImageLayersError {
#[error("timeline shutting down")]
Cancelled,
#[error(transparent)]
GetVectoredError(GetVectoredError),
#[error(transparent)]
PageReconstructError(PageReconstructError),
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
enum FlushLayerError {
/// Timeline cancellation token was cancelled
@@ -439,34 +411,7 @@ enum FlushLayerError {
Cancelled,
#[error(transparent)]
CreateImageLayersError(CreateImageLayersError),
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetVectoredError {
#[error("timeline shutting down")]
Cancelled,
#[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
Oversized(u64),
#[error("Requested at invalid LSN: {0}")]
InvalidLsn(Lsn),
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetReadyAncestorError {
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("Cancelled")]
Cancelled,
PageReconstructError(#[from] PageReconstructError),
#[error(transparent)]
Other(#[from] anyhow::Error),
@@ -511,57 +456,6 @@ pub(crate) enum WaitLsnError {
Timeout(String),
}
// The impls below achieve cancellation mapping for errors.
// Perhaps there's a way of achieving this with less cruft.
impl From<CreateImageLayersError> for CompactionError {
fn from(e: CreateImageLayersError) -> Self {
match e {
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
_ => CompactionError::Other(e.into()),
}
}
}
impl From<CreateImageLayersError> for FlushLayerError {
fn from(e: CreateImageLayersError) -> Self {
match e {
CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
any => FlushLayerError::CreateImageLayersError(any),
}
}
}
impl From<PageReconstructError> for CreateImageLayersError {
fn from(e: PageReconstructError) -> Self {
match e {
PageReconstructError::Cancelled => CreateImageLayersError::Cancelled,
_ => CreateImageLayersError::PageReconstructError(e),
}
}
}
impl From<GetVectoredError> for CreateImageLayersError {
fn from(e: GetVectoredError) -> Self {
match e {
GetVectoredError::Cancelled => CreateImageLayersError::Cancelled,
_ => CreateImageLayersError::GetVectoredError(e),
}
}
}
impl From<GetReadyAncestorError> for PageReconstructError {
fn from(e: GetReadyAncestorError) -> Self {
use GetReadyAncestorError::*;
match e {
AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
Cancelled => PageReconstructError::Cancelled,
Other(other) => PageReconstructError::Other(other),
}
}
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -681,57 +575,6 @@ impl Timeline {
res
}
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
/// Look up multiple page versions at a given LSN
///
/// This naive implementation will be replaced with a more efficient one
/// which actually vectorizes the read path.
pub(crate) async fn get_vectored(
&self,
key_ranges: &[Range<Key>],
lsn: Lsn,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
if !lsn.is_valid() {
return Err(GetVectoredError::InvalidLsn(lsn));
}
let key_count = key_ranges
.iter()
.map(|range| key_range_size(range) as u64)
.sum();
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
return Err(GetVectoredError::Oversized(key_count));
}
let _timer = crate::metrics::GET_VECTORED_LATENCY
.for_task_kind(ctx.task_kind())
.map(|t| t.start_timer());
let mut values = BTreeMap::new();
for range in key_ranges {
let mut key = range.start;
while key != range.end {
assert!(!self.shard_identity.is_key_disposable(&key));
let block = self.get(key, lsn, ctx).await;
if matches!(
block,
Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
) {
return Err(GetVectoredError::Cancelled);
}
values.insert(key, block);
key = key.next();
}
}
Ok(values)
}
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
pub fn get_last_record_lsn(&self) -> Lsn {
self.last_record_lsn.load().last
@@ -1087,6 +930,7 @@ impl Timeline {
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// While we are flushing, we continue to accept read I/O.
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(crate) async fn flush_and_shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -1135,8 +979,6 @@ impl Timeline {
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
/// the graceful [`Timeline::flush_and_shutdown`] function.
pub(crate) async fn shutdown(&self) {
span::debug_assert_current_span_has_tenant_and_timeline_id();
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
@@ -1315,13 +1157,6 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
pub fn get_lazy_slru_download(&self) -> bool {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
.lazy_slru_download
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
}
fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
@@ -1530,7 +1365,7 @@ impl Timeline {
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
cancel,
gate: Gate::default(),
gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),
compaction_lock: tokio::sync::Mutex::default(),
gc_lock: tokio::sync::Mutex::default(),
@@ -2427,8 +2262,60 @@ impl Timeline {
timeline.ancestor_lsn,
cont_lsn
);
let ancestor = match timeline.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(PageReconstructError::from(e)),
};
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
//
// This cannot happen while the pageserver is running normally,
// because you cannot create a branch from a point that isn't
// present in the pageserver yet. However, we don't wait for the
// branch point to be uploaded to cloud storage before creating
// a branch. I.e., the branch LSN need not be remote consistent
// for the branching operation to succeed.
//
// Hence, if we try to load a tenant in such a state where
// 1. the existence of the branch was persisted (in IndexPart and/or locally)
// 2. but the ancestor state is behind branch_lsn because it was not yet persisted
// then we will need to wait for the ancestor timeline to
// re-stream WAL up to branch_lsn before we access it.
//
// How can a tenant get in such a state?
// - ungraceful pageserver process exit
// - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
//
// NB: this could be avoided by requiring
// branch_lsn >= remote_consistent_lsn
// during branch creation.
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
}
Err(state) => {
return Err(PageReconstructError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
}
}
ancestor
.wait_lsn(timeline.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => PageReconstructError::Cancelled,
e @ WaitLsnError::BadState => {
PageReconstructError::Other(anyhow::anyhow!(e))
}
})?;
timeline_owned = ancestor;
timeline = &*timeline_owned;
prev_lsn = Lsn(u64::MAX);
continue 'outer;
@@ -2558,66 +2445,6 @@ impl Timeline {
Some((lsn, img))
}
async fn get_ready_ancestor_timeline(
&self,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetReadyAncestorError> {
let ancestor = match self.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(GetReadyAncestorError::from(e)),
};
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
//
// This cannot happen while the pageserver is running normally,
// because you cannot create a branch from a point that isn't
// present in the pageserver yet. However, we don't wait for the
// branch point to be uploaded to cloud storage before creating
// a branch. I.e., the branch LSN need not be remote consistent
// for the branching operation to succeed.
//
// Hence, if we try to load a tenant in such a state where
// 1. the existence of the branch was persisted (in IndexPart and/or locally)
// 2. but the ancestor state is behind branch_lsn because it was not yet persisted
// then we will need to wait for the ancestor timeline to
// re-stream WAL up to branch_lsn before we access it.
//
// How can a tenant get in such a state?
// - ungraceful pageserver process exit
// - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
//
// NB: this could be avoided by requiring
// branch_lsn >= remote_consistent_lsn
// during branch creation.
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
return Err(GetReadyAncestorError::AncestorStopping(
ancestor.timeline_id,
));
}
Err(state) => {
return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
}
}
ancestor
.wait_lsn(self.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
})?;
Ok(ancestor)
}
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
@@ -2755,7 +2582,7 @@ impl Timeline {
return;
}
err @ Err(
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err;
@@ -2828,12 +2655,12 @@ impl Timeline {
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))]
async fn flush_frozen_layer(
self: &Arc<Self>,
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> Result<(), FlushLayerError> {
span::debug_assert_current_span_has_tenant_and_timeline_id();
// As a special case, when we have just imported an image into the repository,
// instead of writing out a L0 delta layer, we directly write out image layer
// files instead. This is possible as long as *all* the data imported into the
@@ -3032,21 +2859,6 @@ impl Timeline {
Ok(())
}
pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
if let Some(remote_client) = &self.remote_client {
remote_client
.preserve_initdb_archive(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
&self.cancel,
)
.await?;
} else {
bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
}
Ok(())
}
// Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
// in layer map immediately. The caller is responsible to put it into the layer map.
async fn create_delta_layer(
@@ -3138,7 +2950,11 @@ impl Timeline {
}
// Is it time to create a new image layer for the given partition?
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
async fn time_for_new_image_layer(
&self,
partition: &KeySpace,
lsn: Lsn,
) -> anyhow::Result<bool> {
let threshold = self.get_image_creation_threshold();
let guard = self.layers.read().await;
@@ -3158,20 +2974,20 @@ impl Timeline {
// but the range is already covered by image layers at more recent LSNs. Before we
// create a new image layer, check if the range is already covered at more recent LSNs.
if !layers
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
{
debug!(
"Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
img_range.start, img_range.end, cutoff_lsn, lsn
);
return true;
return Ok(true);
}
}
}
}
for part_range in &partition.ranges {
let image_coverage = layers.image_coverage(part_range, lsn);
let image_coverage = layers.image_coverage(part_range, lsn)?;
for (img_range, last_img) in image_coverage {
let img_lsn = if let Some(last_img) = last_img {
last_img.get_lsn_range().end
@@ -3192,7 +3008,7 @@ impl Timeline {
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
if img_lsn < lsn {
let num_deltas =
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold));
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
max_deltas = max_deltas.max(num_deltas);
if num_deltas >= threshold {
@@ -3200,7 +3016,7 @@ impl Timeline {
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
img_range.start, img_range.end, num_deltas, img_lsn, lsn
);
return true;
return Ok(true);
}
}
}
@@ -3210,7 +3026,7 @@ impl Timeline {
max_deltas,
"none of the partitioned ranges had >= {threshold} deltas"
);
false
Ok(false)
}
#[tracing::instrument(skip_all, fields(%lsn, %force))]
@@ -3220,7 +3036,7 @@ impl Timeline {
lsn: Lsn,
force: bool,
ctx: &RequestContext,
) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
) -> Result<Vec<ResidentLayer>, PageReconstructError> {
let timer = self.metrics.create_images_time_histo.start_timer();
let mut image_layers = Vec::new();
@@ -3238,7 +3054,7 @@ impl Timeline {
for partition in partitioning.parts.iter() {
let img_range = start..partition.ranges.last().unwrap().end;
start = img_range.end;
if force || self.time_for_new_image_layer(partition, lsn).await {
if force || self.time_for_new_image_layer(partition, lsn).await? {
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
@@ -3249,12 +3065,10 @@ impl Timeline {
.await?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(CreateImageLayersError::Other(anyhow::anyhow!(
Err(PageReconstructError::Other(anyhow::anyhow!(
"failpoint image-layer-writer-fail-before-finish"
)))
});
let mut key_request_accum = KeySpaceAccum::new();
for range in &partition.ranges {
let mut key = range.start;
while key < range.end {
@@ -3267,55 +3081,34 @@ impl Timeline {
key = key.next();
continue;
}
key_request_accum.add_key(key);
if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
|| key.next() == range.end
{
let results = self
.get_vectored(
&key_request_accum.consume_keyspace().ranges,
lsn,
ctx,
)
.await?;
for (img_key, img) in results {
let img = match img {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but the Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(img_key)
|| is_rel_vm_block_key(img_key)
{
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(
CreateImageLayersError::PageReconstructError(err),
);
}
}
};
image_layer_writer.put_image(img_key, &img).await?;
let img = match self.get(key, lsn, ctx).await {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but the Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(err);
}
}
}
};
image_layer_writer.put_image(key, &img).await?;
key = key.next();
}
}
@@ -3691,7 +3484,7 @@ impl Timeline {
// has not so much sense, because largest holes will corresponds field1/field2 changes.
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
// That is why it is better to measure size of hole as number of covering image layers.
let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
if coverage_size >= min_hole_coverage_size {
heap.push(Hole {
key_range,
@@ -4317,7 +4110,7 @@ impl Timeline {
// we cannot remove C, even though it's older than 2500, because
// the delta layer 2000-3000 depends on it.
if !layers
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
{
debug!("keeping {} because it is the latest layer", l.filename());
// Collect delta key ranges that need image layers to allow garbage
@@ -4447,7 +4240,7 @@ impl Timeline {
.walredo_mgr
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
.await
.context("reconstruct a page image")
.context("Failed to reconstruct a page image:")
{
Ok(img) => img,
Err(e) => return Err(PageReconstructError::WalRedo(e)),

View File

@@ -356,14 +356,12 @@ impl DeleteTimelineFlow {
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip_all, fields(%inplace))]
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
inplace: bool,
) -> Result<(), DeleteTimelineError> {
super::debug_assert_current_span_has_tenant_and_timeline_id();
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
guard.mark_in_progress()?;

View File

@@ -126,27 +126,6 @@ pub(super) struct UploadQueueStopped {
pub(super) deleted_at: SetDeletedFlagProgress,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum NotInitialized {
#[error("queue is in state Uninitialized")]
Uninitialized,
#[error("queue is in state Stopping")]
Stopped,
#[error("queue is shutting down")]
ShuttingDown,
}
impl NotInitialized {
pub(crate) fn is_stopping(&self) -> bool {
use NotInitialized::*;
match self {
Uninitialized => false,
Stopped => true,
ShuttingDown => true,
}
}
}
impl UploadQueue {
pub(crate) fn initialize_empty_remote(
&mut self,
@@ -235,17 +214,17 @@ impl UploadQueue {
}
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
use UploadQueue::*;
match self {
Uninitialized => Err(NotInitialized::Uninitialized.into()),
Initialized(x) => {
if x.shutting_down {
Err(NotInitialized::ShuttingDown.into())
} else {
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
anyhow::bail!("queue is in state {}", self.as_str())
}
UploadQueue::Initialized(x) => {
if !x.shutting_down {
Ok(x)
} else {
anyhow::bail!("queue is shutting down")
}
}
Stopped(_) => Err(NotInitialized::Stopped.into()),
}
}

View File

@@ -11,28 +11,18 @@
//! src/backend/storage/file/fd.c
//!
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
use crate::page_cache::PageWriteGuard;
use crate::tenant::TENANTS_SEGMENT_NAME;
use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use std::fs::{self, File};
use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use tokio_epoll_uring::IoBufMut;
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
use std::os::unix::fs::FileExt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
use utils::fs_ext;
mod io_engine;
mod open_options;
pub use io_engine::IoEngineKind;
pub(crate) use open_options::*;
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
/// the underlying file is closed if the system is low on file descriptors,
@@ -116,38 +106,7 @@ struct SlotInner {
tag: u64,
/// the underlying file
file: Option<OwnedFd>,
}
/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
struct PageWriteGuardBuf {
page: PageWriteGuard<'static>,
init_up_to: usize,
}
// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
fn stable_ptr(&self) -> *const u8 {
self.page.as_ptr()
}
fn bytes_init(&self) -> usize {
self.init_up_to
}
fn bytes_total(&self) -> usize {
self.page.len()
}
}
// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access,
// hence it's safe to hand out the `stable_mut_ptr()`.
unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.page.as_mut_ptr()
}
unsafe fn set_init(&mut self, pos: usize) {
assert!(pos <= self.page.len());
self.init_up_to = pos;
}
file: Option<File>,
}
impl OpenFiles {
@@ -315,10 +274,6 @@ macro_rules! with_file {
let $ident = $this.lock_file().await?;
observe_duration!($op, $($body)*)
}};
($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{
let mut $ident = $this.lock_file().await?;
observe_duration!($op, $($body)*)
}};
}
impl VirtualFile {
@@ -371,9 +326,7 @@ impl VirtualFile {
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
// where our caller doesn't get to use the returned VirtualFile before its
// slot gets re-used by someone else.
let file = observe_duration!(StorageIoOperation::Open, {
open_options.open(path.as_std_path()).await?
});
let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
// Strip all options other than read and write.
//
@@ -447,13 +400,15 @@ impl VirtualFile {
/// Call File::sync_all() on the underlying File.
pub async fn sync_all(&self) -> Result<(), Error> {
with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard
.with_std_file(|std_file| std_file.sync_all()))
with_file!(self, StorageIoOperation::Fsync, |file| file
.as_ref()
.sync_all())
}
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard
.with_std_file(|std_file| std_file.metadata()))
with_file!(self, StorageIoOperation::Metadata, |file| file
.as_ref()
.metadata())
}
/// Helper function internal to `VirtualFile` that looks up the underlying File,
@@ -462,7 +417,7 @@ impl VirtualFile {
///
/// We are doing it via a macro as Rust doesn't support async closures that
/// take on parameters with lifetimes.
async fn lock_file(&self) -> Result<FileGuard, Error> {
async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
let open_files = get_open_files();
let mut handle_guard = {
@@ -508,9 +463,10 @@ impl VirtualFile {
// NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
// case from StorageIoOperation::Open. This helps with identifying thrashing
// of the virtual file descriptor cache.
let file = observe_duration!(StorageIoOperation::OpenAfterReplace, {
self.open_options.open(self.path.as_std_path()).await?
});
let file = observe_duration!(
StorageIoOperation::OpenAfterReplace,
self.open_options.open(&self.path)
)?;
// Store the File in the slot and update the handle in the VirtualFile
// to point to it.
@@ -535,8 +491,9 @@ impl VirtualFile {
self.pos = offset;
}
SeekFrom::End(offset) => {
self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard
.with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))?
self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
.as_ref()
.seek(SeekFrom::End(offset)))?
}
SeekFrom::Current(offset) => {
let pos = self.pos as i128 + offset as i128;
@@ -555,28 +512,25 @@ impl VirtualFile {
Ok(self.pos)
}
pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
where
B: IoBufMut + Send,
{
let (buf, res) =
read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
res.map(|()| buf)
}
/// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
pub async fn read_exact_at_page(
&self,
page: PageWriteGuard<'static>,
offset: u64,
) -> Result<PageWriteGuard<'static>, Error> {
let buf = PageWriteGuardBuf {
page,
init_up_to: 0,
};
let res = self.read_exact_at(buf, offset).await;
res.map(|PageWriteGuardBuf { page, .. }| page)
.map_err(|e| Error::new(ErrorKind::Other, e))
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
while !buf.is_empty() {
match self.read_at(buf, offset).await {
Ok(0) => {
return Err(Error::new(
std::io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
))
}
Ok(n) => {
buf = &mut buf[n..];
offset += n as u64;
}
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
Ok(())
}
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
@@ -626,35 +580,22 @@ impl VirtualFile {
Ok(n)
}
pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
where
B: tokio_epoll_uring::BoundedBufMut + Send,
{
let file_guard = match self.lock_file().await {
Ok(file_guard) => file_guard,
Err(e) => return (buf, Err(e)),
};
observe_duration!(StorageIoOperation::Read, {
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
if let Ok(size) = res {
STORAGE_IO_SIZE
.with_label_values(&[
"read",
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
])
.add(size as i64);
}
(buf, res)
})
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let result = with_file!(self, StorageIoOperation::Read, |file| file
.as_ref()
.read_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
.add(size as i64);
}
result
}
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
let result = with_file!(self, StorageIoOperation::Write, |file_guard| {
file_guard.with_std_file(|std_file| std_file.write_at(buf, offset))
});
let result = with_file!(self, StorageIoOperation::Write, |file| file
.as_ref()
.write_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
@@ -664,241 +605,18 @@ impl VirtualFile {
}
}
// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
pub async fn read_exact_at_impl<B, F, Fut>(
buf: B,
mut offset: u64,
mut read_at: F,
) -> (B, std::io::Result<()>)
where
B: IoBufMut + Send,
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
{
use tokio_epoll_uring::BoundedBuf;
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
while buf.bytes_total() != 0 {
let res;
(buf, res) = read_at(buf, offset).await;
match res {
Ok(0) => break,
Ok(n) => {
buf = buf.slice(n..);
offset += n as u64;
}
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return (buf.into_inner(), Err(e)),
}
}
// NB: don't use `buf.is_empty()` here; it is from the
// `impl Deref for Slice { Target = [u8] }`; the the &[u8]
// returned by it only covers the initialized portion of `buf`.
// Whereas we're interested in ensuring that we filled the entire
// buffer that the user passed in.
if buf.bytes_total() != 0 {
(
buf.into_inner(),
Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
)),
)
} else {
assert_eq!(buf.len(), buf.bytes_total());
(buf.into_inner(), Ok(()))
}
struct FileGuard<'a> {
slot_guard: RwLockReadGuard<'a, SlotInner>,
}
#[cfg(test)]
mod test_read_exact_at_impl {
use std::{collections::VecDeque, sync::Arc};
use tokio_epoll_uring::{BoundedBuf, BoundedBufMut};
use super::read_exact_at_impl;
struct Expectation {
offset: u64,
bytes_total: usize,
result: std::io::Result<Vec<u8>>,
}
struct MockReadAt {
expectations: VecDeque<Expectation>,
}
impl MockReadAt {
async fn read_at(
&mut self,
mut buf: tokio_epoll_uring::Slice<Vec<u8>>,
offset: u64,
) -> (tokio_epoll_uring::Slice<Vec<u8>>, std::io::Result<usize>) {
let exp = self
.expectations
.pop_front()
.expect("read_at called but we have no expectations left");
assert_eq!(exp.offset, offset);
assert_eq!(exp.bytes_total, buf.bytes_total());
match exp.result {
Ok(bytes) => {
assert!(bytes.len() <= buf.bytes_total());
buf.put_slice(&bytes);
(buf, Ok(bytes.len()))
}
Err(e) => (buf, Err(e)),
}
}
}
impl Drop for MockReadAt {
fn drop(&mut self) {
assert_eq!(self.expectations.len(), 0);
}
}
#[tokio::test]
async fn test_basic() {
let buf = Vec::with_capacity(5);
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::from(vec![Expectation {
offset: 0,
bytes_total: 5,
result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
}]),
}));
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
assert!(res.is_ok());
assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
}
#[tokio::test]
async fn test_empty_buf_issues_no_syscall() {
let buf = Vec::new();
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::new(),
}));
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
assert!(res.is_ok());
}
#[tokio::test]
async fn test_two_read_at_calls_needed_until_buf_filled() {
let buf = Vec::with_capacity(4);
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::from(vec![
Expectation {
offset: 0,
bytes_total: 4,
result: Ok(vec![b'a', b'b']),
},
Expectation {
offset: 2,
bytes_total: 2,
result: Ok(vec![b'c', b'd']),
},
]),
}));
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
assert!(res.is_ok());
assert_eq!(buf, vec![b'a', b'b', b'c', b'd']);
}
#[tokio::test]
async fn test_eof_before_buffer_full() {
let buf = Vec::with_capacity(3);
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::from(vec![
Expectation {
offset: 0,
bytes_total: 3,
result: Ok(vec![b'a']),
},
Expectation {
offset: 1,
bytes_total: 2,
result: Ok(vec![b'b']),
},
Expectation {
offset: 2,
bytes_total: 1,
result: Ok(vec![]),
},
]),
}));
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
let Err(err) = res else {
panic!("should return an error");
};
assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof);
assert_eq!(format!("{err}"), "failed to fill whole buffer");
// buffer contents on error are unspecified
}
}
struct FileGuard {
slot_guard: RwLockReadGuard<'static, SlotInner>,
}
impl AsRef<OwnedFd> for FileGuard {
fn as_ref(&self) -> &OwnedFd {
impl<'a> AsRef<File> for FileGuard<'a> {
fn as_ref(&self) -> &File {
// This unwrap is safe because we only create `FileGuard`s
// if we know that the file is Some.
self.slot_guard.file.as_ref().unwrap()
}
}
impl FileGuard {
/// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
fn with_std_file<F, R>(&self, with: F) -> R
where
F: FnOnce(&File) -> R,
{
// SAFETY:
// - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
// - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut`
let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
let res = with(&file);
let _ = file.into_raw_fd();
res
}
/// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
fn with_std_file_mut<F, R>(&mut self, with: F) -> R
where
F: FnOnce(&mut File) -> R,
{
// SAFETY:
// - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
// - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd
let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
let res = with(&mut file);
let _ = file.into_raw_fd();
res
}
}
impl tokio_epoll_uring::IoFd for FileGuard {
unsafe fn as_fd(&self) -> RawFd {
let owned_fd: &OwnedFd = self.as_ref();
owned_fd.as_raw_fd()
}
}
#[cfg(test)]
impl VirtualFile {
pub(crate) async fn read_blk(
@@ -906,19 +624,16 @@ impl VirtualFile {
blknum: u32,
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
use crate::page_cache::PAGE_SZ;
let buf = vec![0; PAGE_SZ];
let buf = self
.read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
let mut buf = [0; PAGE_SZ];
self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
.await?;
Ok(crate::tenant::block_io::BlockLease::Vec(buf))
Ok(std::sync::Arc::new(buf).into())
}
async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
let mut tmp = vec![0; 128];
loop {
let res;
(tmp, res) = self.read_at(tmp, self.pos).await;
match res {
let mut tmp = [0; 128];
match self.read_at(&mut tmp, self.pos).await {
Ok(0) => return Ok(()),
Ok(n) => {
self.pos += n as u64;
@@ -994,12 +709,10 @@ impl OpenFiles {
/// Initialize the virtual file module. This must be called once at page
/// server startup.
///
#[cfg(not(test))]
pub fn init(num_slots: usize, engine: IoEngineKind) {
pub fn init(num_slots: usize) {
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
panic!("virtual_file::init called twice");
}
io_engine::init(engine);
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}
@@ -1044,10 +757,10 @@ mod tests {
}
impl MaybeVirtualFile {
async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
match self {
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
}
}
async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
@@ -1089,14 +802,14 @@ mod tests {
// Helper function to slurp a portion of a file into a string
async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
let buf = vec![0; len];
let buf = self.read_exact_at(buf, pos).await?;
let mut buf = vec![0; len];
self.read_exact_at(&mut buf, pos).await?;
Ok(String::from_utf8(buf).unwrap())
}
}
#[tokio::test]
async fn test_virtual_files() -> anyhow::Result<()> {
async fn test_virtual_files() -> Result<(), Error> {
// The real work is done in the test_files() helper function. This
// allows us to run the same set of tests against a native File, and
// VirtualFile. We trust the native Files and wouldn't need to test them,
@@ -1112,17 +825,14 @@ mod tests {
}
#[tokio::test]
async fn test_physical_files() -> anyhow::Result<()> {
async fn test_physical_files() -> Result<(), Error> {
test_files("physical_files", |path, open_options| async move {
Ok(MaybeVirtualFile::File({
let owned_fd = open_options.open(path.as_std_path()).await?;
File::from(owned_fd)
}))
Ok(MaybeVirtualFile::File(open_options.open(path)?))
})
.await
}
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
where
OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
@@ -1266,11 +976,11 @@ mod tests {
for _threadno in 0..THREADS {
let files = files.clone();
let hdl = rt.spawn(async move {
let mut buf = vec![0u8; SIZE];
let mut buf = [0u8; SIZE];
let mut rng = rand::rngs::OsRng;
for _ in 1..1000 {
let f = &files[rng.gen_range(0..files.len())];
buf = f.read_exact_at(buf, 0).await.unwrap();
f.read_exact_at(&mut buf, 0).await.unwrap();
assert!(buf == SAMPLE);
}
});

View File

@@ -1,114 +0,0 @@
//! [`super::VirtualFile`] supports different IO engines.
//!
//! The [`IoEngineKind`] enum identifies them.
//!
//! The choice of IO engine is global.
//! Initialize using [`init`].
//!
//! Then use [`get`] and [`super::OpenOptions`].
#[derive(
Copy,
Clone,
PartialEq,
Eq,
Hash,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
Debug,
)]
#[strum(serialize_all = "kebab-case")]
pub enum IoEngineKind {
StdFs,
#[cfg(target_os = "linux")]
TokioEpollUring,
}
static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
#[cfg(not(test))]
pub(super) fn init(engine: IoEngineKind) {
if IO_ENGINE.set(engine).is_err() {
panic!("called twice");
}
crate::metrics::virtual_file_io_engine::KIND
.with_label_values(&[&format!("{engine}")])
.set(1);
}
pub(super) fn get() -> &'static IoEngineKind {
#[cfg(test)]
{
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
Ok(v) => match v.parse::<IoEngineKind>() {
Ok(engine_kind) => engine_kind,
Err(e) => {
panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
}
},
Err(std::env::VarError::NotPresent) => {
crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
.parse()
.unwrap()
}
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {env_var_name} is not unicode");
}
})
}
#[cfg(not(test))]
IO_ENGINE.get().unwrap()
}
use std::os::unix::prelude::FileExt;
use super::FileGuard;
impl IoEngineKind {
pub(super) async fn read_at<B>(
&self,
file_guard: FileGuard,
offset: u64,
mut buf: B,
) -> ((FileGuard, B), std::io::Result<usize>)
where
B: tokio_epoll_uring::BoundedBufMut + Send,
{
match self {
IoEngineKind::StdFs => {
// SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
let dst = unsafe {
std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
};
let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset));
if let Ok(nbytes) = &res {
assert!(*nbytes <= buf.bytes_total());
// SAFETY: see above assertion
unsafe {
buf.set_init(*nbytes);
}
}
#[allow(dropping_references)]
drop(dst);
((file_guard, buf), res)
}
#[cfg(target_os = "linux")]
IoEngineKind::TokioEpollUring => {
let system = tokio_epoll_uring::thread_local_system().await;
let (resources, res) = system.read(file_guard, offset, buf).await;
(
resources,
res.map_err(|e| match e {
tokio_epoll_uring::Error::Op(e) => e,
tokio_epoll_uring::Error::System(system) => {
std::io::Error::new(std::io::ErrorKind::Other, system)
}
}),
)
}
}
}
}

Some files were not shown because too many files have changed in this diff Show More